Refactor the JNI interface to make it more modular and maintainable (#802)

This commit is contained in:
Fangjun Kuang
2024-04-24 09:48:42 +08:00
committed by GitHub
parent dc5af04830
commit 9b67a476e6
116 changed files with 3502 additions and 3316 deletions

View File

@@ -1 +1 @@
../android/SherpaOnnxAudioTagging/app/src/main/java/com/k2fsa/sherpa/onnx/audio/tagging/AudioTagging.kt
../sherpa-onnx/kotlin-api/AudioTagging.kt

View File

@@ -0,0 +1 @@
../sherpa-onnx/kotlin-api/FeatureConfig.kt

View File

@@ -1,245 +0,0 @@
package com.k2fsa.sherpa.onnx
import android.content.res.AssetManager
/**
 * Sample callback passed to the TTS generator in this file.
 *
 * Prints how many samples the received chunk contains.
 */
fun callback(samples: FloatArray) {
    println("callback got called with ${samples.size} samples")
}
/**
 * Entry point: runs every example in this file, one after another.
 */
fun main() {
    testSpokenLanguageIdentifcation()
    testAudioTagging()
    testSpeakerRecognition()
    testTts()

    // Streaming ASR is exercised once per supported model type.
    for (modelType in listOf("transducer", "zipformer2-ctc")) {
        testAsr(modelType)
    }
}
/**
 * Runs spoken-language identification on a few sample wave files.
 *
 * A whisper-based [SpokenLanguageIdentification] is built from local model
 * files; each wave file is then fed through a fresh stream and the file name
 * plus the computed language are printed.
 */
fun testSpokenLanguageIdentifcation() {
    val whisperConfig = SpokenLanguageIdentificationWhisperConfig(
        encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx",
        decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx",
        tailPaddings = 33,
    )
    val slid = SpokenLanguageIdentification(
        assetManager = null,
        config = SpokenLanguageIdentificationConfig(
            whisper = whisperConfig,
            numThreads = 1,
            debug = true,
            provider = "cpu",
        ),
    )

    listOf(
        "./spoken-language-identification-test-wavs/ar-arabic.wav",
        "./spoken-language-identification-test-wavs/bg-bulgarian.wav",
        "./spoken-language-identification-test-wavs/de-german.wav",
    ).forEach { wavePath ->
        // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        val stream = slid.createStream()
        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val lang = slid.compute(stream)
        stream.release()

        println(wavePath)
        println(lang)
    }
}
/**
 * Runs the zipformer audio tagger on four sample wave files.
 *
 * For every file a new stream is created, the decoded samples are pushed into
 * it, and the events returned by the tagger are printed. The tagger is
 * released once all files have been processed.
 */
fun testAudioTagging() {
    val modelConfig = AudioTaggingModelConfig(
        zipformer = OfflineZipformerAudioTaggingModelConfig(
            model = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx",
        ),
        numThreads = 1,
        debug = true,
        provider = "cpu",
    )
    val tagger = AudioTagging(
        assetManager = null,
        config = AudioTaggingConfig(
            model = modelConfig,
            labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv",
            topK = 5,
        ),
    )

    val separator = "----------"
    println(separator)

    listOf(
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/2.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/3.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/4.wav",
    ).forEach { wavePath ->
        val stream = tagger.createStream()

        // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val events = tagger.compute(stream)
        stream.release()

        println(wavePath)
        println(events)
        println(separator)
    }

    tagger.release()
}
/**
 * Computes the speaker embedding for the wave file at [filename].
 *
 * The samples and sample rate are read with [WaveReader.readWaveFromFile],
 * pushed into a fresh stream created by [extractor], and the input is marked
 * as finished before the embedding is computed.
 *
 * @param extractor the extractor used to create the stream and compute the embedding
 * @param filename path of the wave file to read
 * @return the embedding returned by the extractor for the stream
 * @throws IllegalStateException if the extractor reports that the stream is not ready
 */
fun computeEmbedding(extractor: SpeakerEmbeddingExtractor, filename: String): FloatArray {
    // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
    val objArray = WaveReader.readWaveFromFile(filename = filename)
    val samples = objArray[0] as FloatArray
    val sampleRate = objArray[1] as Int

    val stream = extractor.createStream()
    try {
        stream.acceptWaveform(sampleRate = sampleRate, samples = samples)
        stream.inputFinished()
        check(extractor.isReady(stream)) { "stream is not ready after inputFinished(): $filename" }
        return extractor.compute(stream)
    } finally {
        // Release the stream even when the readiness check or compute() throws,
        // so a failing file does not leak it.
        stream.release()
    }
}
/**
 * Exercises speaker enrollment and lookup with [SpeakerEmbeddingManager].
 *
 * Part 1 registers one embedding per speaker and verifies that a second
 * utterance of speaker 1 is matched to "speaker1".
 * Part 2 registers two embeddings under a single name and verifies that
 * speaker 1 is found while speaker 2 yields an empty name.
 *
 * @throws IllegalStateException if any of the checks fails
 */
fun testSpeakerRecognition() {
    val config = SpeakerEmbeddingExtractorConfig(
        model = "./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx",
    )
    val extractor = SpeakerEmbeddingExtractor(config = config)

    val embedding1a = computeEmbedding(extractor, "./speaker1_a_cn_16k.wav")
    val embedding2a = computeEmbedding(extractor, "./speaker2_a_cn_16k.wav")
    val embedding1b = computeEmbedding(extractor, "./speaker1_b_cn_16k.wav")

    // Part 1: one embedding per speaker.
    var manager = SpeakerEmbeddingManager(extractor.dim())
    var ok = manager.add(name = "speaker1", embedding = embedding1a)
    check(ok)

    // The result must be captured here; otherwise the check below would only
    // re-test the outcome of the first add().
    ok = manager.add(name = "speaker2", embedding = embedding2a)
    check(ok)

    var name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "speaker1")

    manager.release()

    // Part 2: several embeddings registered under one name.
    manager = SpeakerEmbeddingManager(extractor.dim())
    val embeddingList = mutableListOf(embedding1a, embedding1b)
    ok = manager.add(name = "s1", embedding = embeddingList.toTypedArray())
    check(ok)

    name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "s1")

    // Speaker 2 was never added to this manager, so no name is expected.
    name = manager.search(embedding = embedding2a, threshold = 0.5f)
    check(name.isEmpty())

    manager.release()
}
/**
 * Synthesizes one English sentence with a VITS piper model and saves it to
 * "test-en.wav". [callback] is passed to the generator as its callback.
 */
fun testTts() {
    // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
    // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
    val vitsConfig = OfflineTtsVitsModelConfig(
        model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx",
        tokens = "./vits-piper-en_US-amy-low/tokens.txt",
        dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data",
    )
    val ttsConfig = OfflineTtsConfig(
        model = OfflineTtsModelConfig(
            vits = vitsConfig,
            numThreads = 1,
            debug = true,
        ),
    )
    val tts = OfflineTts(config = ttsConfig)

    val sentence = "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”"
    val audio = tts.generateWithCallback(text = sentence, callback = ::callback)
    audio.save(filename = "test-en.wav")
}
/**
 * Decodes one test wave file with a streaming recognizer and prints the text.
 *
 * @param type which model to use: "transducer" or "zipformer2-ctc"
 * @throws IllegalArgumentException if [type] is neither of the supported values
 */
fun testAsr(type: String) {
    val featConfig = FeatureConfig(
        sampleRate = 16000,
        featureDim = 80,
    )

    // Each model type comes with its own test wave file and model configuration.
    val (waveFilename, modelConfig) = when (type) {
        "transducer" -> Pair(
            "./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav",
            // please refer to
            // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
            // to download pre-trained models
            OnlineModelConfig(
                transducer = OnlineTransducerModelConfig(
                    encoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx",
                    decoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx",
                    joiner = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt",
                numThreads = 1,
                debug = false,
            ),
        )

        "zipformer2-ctc" -> Pair(
            "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav",
            OnlineModelConfig(
                zipformer2Ctc = OnlineZipformer2CtcModelConfig(
                    model = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt",
                numThreads = 1,
                debug = false,
            ),
        )

        else -> throw IllegalArgumentException(type)
    }

    val model = SherpaOnnx(
        config = OnlineRecognizerConfig(
            modelConfig = modelConfig,
            lmConfig = OnlineLMConfig(),
            featConfig = featConfig,
            endpointConfig = EndpointConfig(),
            enableEndpoint = true,
            decodingMethod = "greedy_search",
            maxActivePaths = 4,
        ),
    )

    // Decodes for as long as the recognizer reports it is ready.
    fun decodeWhileReady() {
        while (model.isReady()) {
            model.decode()
        }
    }

    // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
    val wave = WaveReader.readWaveFromFile(filename = waveFilename)
    val samples = wave[0] as FloatArray
    val sampleRate = wave[1] as Int

    model.acceptWaveform(samples, sampleRate = sampleRate)
    decodeWhileReady()

    // Append 0.5 seconds of zero samples before signalling the end of input.
    val tailPaddings = FloatArray((sampleRate * 0.5).toInt())
    model.acceptWaveform(tailPaddings, sampleRate = sampleRate)
    model.inputFinished()
    decodeWhileReady()

    println("results: ${model.text}")
}

View File

@@ -0,0 +1 @@
../sherpa-onnx/kotlin-api/OfflineRecognizer.kt

View File

@@ -1 +1 @@
../android/SherpaOnnxAudioTagging/app/src/main/java/com/k2fsa/sherpa/onnx/audio/tagging/OfflineStream.kt
../sherpa-onnx/kotlin-api/OfflineStream.kt

View File

@@ -0,0 +1 @@
../sherpa-onnx/kotlin-api/OnlineRecognizer.kt

View File

@@ -0,0 +1 @@
../sherpa-onnx/kotlin-api/OnlineStream.kt

View File

@@ -1 +0,0 @@
../android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt

View File

@@ -1 +0,0 @@
../android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt

View File

@@ -1 +1 @@
../android/SherpaOnnxSpeakerIdentification/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/identification/Speaker.kt
../sherpa-onnx/kotlin-api/Speaker.kt

View File

@@ -1 +1 @@
../android/SherpaOnnxSpokenLanguageIdentification/app/src/main/java/com/k2fsa/sherpa/onnx/slid/SpokenLanguageIdentification.kt
../sherpa-onnx/kotlin-api/SpokenLanguageIdentification.kt

View File

@@ -1 +1 @@
../android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt
../sherpa-onnx/kotlin-api/Vad.kt

View File

@@ -1 +1 @@
../android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/WaveReader.kt
../sherpa-onnx/kotlin-api/WaveReader.kt

View File

@@ -44,9 +44,23 @@ function testSpeakerEmbeddingExtractor() {
if [ ! -f ./speaker2_a_cn_16k.wav ]; then
curl -SL -O https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav
fi
out_filename=test_speaker_id.jar
kotlinc-jvm -include-runtime -d $out_filename \
test_speaker_id.kt \
OnlineStream.kt \
Speaker.kt \
WaveReader.kt \
faked-asset-manager.kt \
faked-log.kt
ls -lh $out_filename
java -Djava.library.path=../build/lib -jar $out_filename
}
function testAsr() {
function testOnlineAsr() {
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
git lfs install
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
@@ -57,6 +71,20 @@ function testAsr() {
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
fi
out_filename=test_online_asr.jar
kotlinc-jvm -include-runtime -d $out_filename \
test_online_asr.kt \
FeatureConfig.kt \
OnlineRecognizer.kt \
OnlineStream.kt \
WaveReader.kt \
faked-asset-manager.kt \
faked-log.kt
ls -lh $out_filename
java -Djava.library.path=../build/lib -jar $out_filename
}
function testTts() {
@@ -65,16 +93,42 @@ function testTts() {
tar xf vits-piper-en_US-amy-low.tar.bz2
rm vits-piper-en_US-amy-low.tar.bz2
fi
out_filename=test_tts.jar
kotlinc-jvm -include-runtime -d $out_filename \
test_tts.kt \
Tts.kt \
faked-asset-manager.kt \
faked-log.kt
ls -lh $out_filename
java -Djava.library.path=../build/lib -jar $out_filename
}
# Downloads the zipformer audio-tagging model when its directory is missing,
# then compiles the Kotlin audio-tagging example into a jar and runs it.
function testAudioTagging() {
  local model_dir=sherpa-onnx-zipformer-audio-tagging-2024-04-09
  local archive="${model_dir}.tar.bz2"

  if [ ! -d "${model_dir}" ]; then
    curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/${archive}"
    tar xvf "${archive}"
    rm "${archive}"
  fi

  out_filename=test_audio_tagging.jar
  kotlinc-jvm -include-runtime -d "$out_filename" \
    test_audio_tagging.kt \
    AudioTagging.kt \
    OfflineStream.kt \
    WaveReader.kt \
    faked-asset-manager.kt \
    faked-log.kt

  ls -lh "$out_filename"

  # The native library is looked up in ../build/lib.
  java -Djava.library.path=../build/lib -jar "$out_filename"
}
function testSpokenLanguageIdentification() {
if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
@@ -87,50 +141,44 @@ function testSpokenLanguageIdentification() {
tar xvf spoken-language-identification-test-wavs.tar.bz2
rm spoken-language-identification-test-wavs.tar.bz2
fi
out_filename=test_language_id.jar
kotlinc-jvm -include-runtime -d $out_filename \
test_language_id.kt \
SpokenLanguageIdentification.kt \
OfflineStream.kt \
WaveReader.kt \
faked-asset-manager.kt \
faked-log.kt
ls -lh $out_filename
java -Djava.library.path=../build/lib -jar $out_filename
}
function test() {
testSpokenLanguageIdentification
testAudioTagging
testSpeakerEmbeddingExtractor
testAsr
testTts
}
test
kotlinc-jvm -include-runtime -d main.jar \
AudioTagging.kt \
Main.kt \
OfflineStream.kt \
SherpaOnnx.kt \
Speaker.kt \
SpokenLanguageIdentification.kt \
Tts.kt \
WaveReader.kt \
faked-asset-manager.kt \
faked-log.kt
ls -lh main.jar
java -Djava.library.path=../build/lib -jar main.jar
function testTwoPass() {
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
fi
function testOfflineAsr() {
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi
kotlinc-jvm -include-runtime -d 2pass.jar test-2pass.kt WaveReader.kt SherpaOnnx2Pass.kt faked-asset-manager.kt
ls -lh 2pass.jar
java -Djava.library.path=../build/lib -jar 2pass.jar
out_filename=test_offline_asr.jar
kotlinc-jvm -include-runtime -d $out_filename \
test_offline_asr.kt \
FeatureConfig.kt \
OfflineRecognizer.kt \
OfflineStream.kt \
WaveReader.kt \
faked-asset-manager.kt
ls -lh $out_filename
java -Djava.library.path=../build/lib -jar $out_filename
}
testTwoPass
testSpeakerEmbeddingExtractor
testOnlineAsr
testTts
testAudioTagging
testSpokenLanguageIdentification
testOfflineAsr

View File

@@ -1,49 +0,0 @@
package com.k2fsa.sherpa.onnx
/** Entry point: runs the two-pass recognition example. */
fun main() = test2Pass()
/**
 * Decodes the same wave file twice: first with the streaming recognizer from
 * [createFirstPass], then with the offline recognizer from [createSecondPass].
 * Both resulting texts are printed.
 */
fun test2Pass() {
    val firstPass = createFirstPass()
    val secondPass = createSecondPass()

    // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
    val wave = WaveReader.readWaveFromFile(
        filename = "./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav",
    )
    val samples = wave[0] as FloatArray
    val sampleRate = wave[1] as Int

    firstPass.acceptWaveform(samples, sampleRate = sampleRate)
    while (firstPass.isReady()) {
        firstPass.decode()
    }
    firstPass.text.let { text -> println("First pass text: $text") }

    // The second pass decodes the complete sample buffer in a single call.
    secondPass.decode(samples, sampleRate).let { text -> println("Second pass text: $text") }
}
/**
 * Builds the streaming recognizer used for the first pass.
 *
 * Uses 16 kHz / 80-dim features, the model configuration returned by
 * `getModelConfig(type = 1)`, and endpoint detection enabled.
 *
 * @return a new [SherpaOnnx] instance
 * @throws IllegalStateException if no model configuration exists for type 1
 */
fun createFirstPass(): SherpaOnnx {
    val featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80)
    // Fail with a descriptive message instead of a bare NullPointerException.
    val modelConfig = checkNotNull(getModelConfig(type = 1)) {
        "getModelConfig(type = 1) returned null"
    }
    val config = OnlineRecognizerConfig(
        featConfig = featConfig,
        modelConfig = modelConfig,
        endpointConfig = getEndpointConfig(),
        enableEndpoint = true,
    )
    return SherpaOnnx(config = config)
}
/**
 * Builds the offline recognizer used for the second pass.
 *
 * Uses 16 kHz / 80-dim features and the model configuration returned by
 * `getOfflineModelConfig(type = 2)`.
 *
 * @return a new [SherpaOnnxOffline] instance
 * @throws IllegalStateException if no offline model configuration exists for type 2
 */
fun createSecondPass(): SherpaOnnxOffline {
    val featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80)
    // Fail with a descriptive message instead of a bare NullPointerException.
    val modelConfig = checkNotNull(getOfflineModelConfig(type = 2)) {
        "getOfflineModelConfig(type = 2) returned null"
    }
    val config = OfflineRecognizerConfig(
        featConfig = featConfig,
        modelConfig = modelConfig,
    )
    return SherpaOnnxOffline(config = config)
}

View File

@@ -0,0 +1,49 @@
package com.k2fsa.sherpa.onnx
/** Entry point: runs the audio-tagging example. */
fun main() = testAudioTagging()
/**
 * Runs the zipformer audio tagger on four sample wave files.
 *
 * For every file a new stream is created, the decoded samples are pushed into
 * it, and the events returned by the tagger are printed. The tagger is
 * released once all files have been processed.
 */
fun testAudioTagging() {
    val modelConfig = AudioTaggingModelConfig(
        zipformer = OfflineZipformerAudioTaggingModelConfig(
            model = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx",
        ),
        numThreads = 1,
        debug = true,
        provider = "cpu",
    )
    val tagger = AudioTagging(
        config = AudioTaggingConfig(
            model = modelConfig,
            labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv",
            topK = 5,
        ),
    )

    val separator = "----------"
    println(separator)

    listOf(
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/2.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/3.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/4.wav",
    ).forEach { wavePath ->
        val stream = tagger.createStream()

        // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val events = tagger.compute(stream)
        stream.release()

        println(wavePath)
        println(events)
        println(separator)
    }

    tagger.release()
}

View File

@@ -0,0 +1,43 @@
package com.k2fsa.sherpa.onnx
/** Entry point: runs the spoken-language identification example. */
fun main() = testSpokenLanguageIdentifcation()
/**
 * Runs spoken-language identification on a few sample wave files.
 *
 * A whisper-based [SpokenLanguageIdentification] is built from local model
 * files; each wave file is then fed through a fresh stream and the file name
 * plus the computed language are printed. The identifier is released at the end.
 */
fun testSpokenLanguageIdentifcation() {
    val whisperConfig = SpokenLanguageIdentificationWhisperConfig(
        encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx",
        decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx",
        tailPaddings = 33,
    )
    val slid = SpokenLanguageIdentification(
        config = SpokenLanguageIdentificationConfig(
            whisper = whisperConfig,
            numThreads = 1,
            debug = true,
            provider = "cpu",
        ),
    )

    listOf(
        "./spoken-language-identification-test-wavs/ar-arabic.wav",
        "./spoken-language-identification-test-wavs/bg-bulgarian.wav",
        "./spoken-language-identification-test-wavs/de-german.wav",
    ).forEach { wavePath ->
        // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        val stream = slid.createStream()
        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val lang = slid.compute(stream)
        stream.release()

        println(wavePath)
        println(lang)
    }

    slid.release()
}

View File

@@ -0,0 +1,32 @@
package com.k2fsa.sherpa.onnx
/**
 * Entry point: decodes one wave file with the offline recognizer from
 * [createOfflineRecognizer] and prints the result.
 */
fun main() {
    val recognizer = createOfflineRecognizer()

    // Use a test wave from the model directory that the offline-ASR step of the
    // run script downloads (sherpa-onnx-whisper-tiny.en). The previous path
    // pointed into the streaming-zipformer-en-20M directory, which only the
    // removed two-pass step fetched.
    // NOTE(review): assumes the whisper tiny.en archive ships test_wavs/0.wav and
    // that getOfflineModelConfig(type = 2) selects that model — confirm.
    val waveFilename = "./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav"

    // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
    val objArray = WaveReader.readWaveFromFile(
        filename = waveFilename,
    )
    val samples: FloatArray = objArray[0] as FloatArray
    val sampleRate: Int = objArray[1] as Int

    val stream = recognizer.createStream()
    stream.acceptWaveform(samples, sampleRate = sampleRate)
    recognizer.decode(stream)

    val result = recognizer.getResult(stream)
    println(result)

    stream.release()
    recognizer.release()
}
/**
 * Builds the offline recognizer used by this example.
 *
 * Uses 16 kHz / 80-dim features and the model configuration returned by
 * `getOfflineModelConfig(type = 2)`.
 *
 * @return a new [OfflineRecognizer]
 * @throws IllegalStateException if no offline model configuration exists for type 2
 */
fun createOfflineRecognizer(): OfflineRecognizer {
    val featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80)
    // Fail with a descriptive message instead of a bare NullPointerException.
    val modelConfig = checkNotNull(getOfflineModelConfig(type = 2)) {
        "getOfflineModelConfig(type = 2) returned null"
    }
    val config = OfflineRecognizerConfig(
        featConfig = featConfig,
        modelConfig = modelConfig,
    )
    return OfflineRecognizer(config = config)
}

View File

@@ -0,0 +1,87 @@
package com.k2fsa.sherpa.onnx
/** Entry point: runs the streaming ASR example once per supported model type. */
fun main() {
    for (modelType in listOf("transducer", "zipformer2-ctc")) {
        testOnlineAsr(modelType)
    }
}
/**
 * Decodes one test wave file with a streaming recognizer and prints the text.
 *
 * The stream and the recognizer are both released before returning.
 *
 * @param type which model to use: "transducer" or "zipformer2-ctc"
 * @throws IllegalArgumentException if [type] is neither of the supported values
 */
fun testOnlineAsr(type: String) {
    val featConfig = FeatureConfig(
        sampleRate = 16000,
        featureDim = 80,
    )

    // Each model type comes with its own test wave file and model configuration.
    val (waveFilename, modelConfig) = when (type) {
        "transducer" -> Pair(
            "./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav",
            // please refer to
            // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
            // to download pre-trained models
            OnlineModelConfig(
                transducer = OnlineTransducerModelConfig(
                    encoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx",
                    decoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx",
                    joiner = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt",
                numThreads = 1,
                debug = false,
            ),
        )

        "zipformer2-ctc" -> Pair(
            "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav",
            OnlineModelConfig(
                zipformer2Ctc = OnlineZipformer2CtcModelConfig(
                    model = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt",
                numThreads = 1,
                debug = false,
            ),
        )

        else -> throw IllegalArgumentException(type)
    }

    val recognizer = OnlineRecognizer(
        config = OnlineRecognizerConfig(
            modelConfig = modelConfig,
            lmConfig = OnlineLMConfig(),
            featConfig = featConfig,
            endpointConfig = EndpointConfig(),
            enableEndpoint = true,
            decodingMethod = "greedy_search",
            maxActivePaths = 4,
        ),
    )

    // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
    val wave = WaveReader.readWaveFromFile(filename = waveFilename)
    val samples = wave[0] as FloatArray
    val sampleRate = wave[1] as Int

    val stream = recognizer.createStream()

    // Decodes for as long as the recognizer reports the stream is ready.
    fun decodeWhileReady() {
        while (recognizer.isReady(stream)) {
            recognizer.decode(stream)
        }
    }

    stream.acceptWaveform(samples, sampleRate = sampleRate)
    decodeWhileReady()

    // Append 0.5 seconds of zero samples before signalling the end of input.
    val tailPaddings = FloatArray((sampleRate * 0.5).toInt())
    stream.acceptWaveform(tailPaddings, sampleRate = sampleRate)
    stream.inputFinished()
    decodeWhileReady()

    println("results: ${recognizer.getResult(stream).text}")

    stream.release()
    recognizer.release()
}

View File

@@ -0,0 +1,62 @@
package com.k2fsa.sherpa.onnx
/** Entry point: runs the speaker-recognition example. */
fun main() = testSpeakerRecognition()
/**
 * Exercises speaker enrollment and lookup with [SpeakerEmbeddingManager].
 *
 * Part 1 registers one embedding per speaker and verifies that a second
 * utterance of speaker 1 is matched to "speaker1".
 * Part 2 registers two embeddings under a single name and verifies that
 * speaker 1 is found while speaker 2 yields an empty name.
 * The managers and the extractor are released before returning.
 *
 * @throws IllegalStateException if any of the checks fails
 */
fun testSpeakerRecognition() {
    val config = SpeakerEmbeddingExtractorConfig(
        model = "./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx",
    )
    val extractor = SpeakerEmbeddingExtractor(config = config)

    val embedding1a = computeEmbedding(extractor, "./speaker1_a_cn_16k.wav")
    val embedding2a = computeEmbedding(extractor, "./speaker2_a_cn_16k.wav")
    val embedding1b = computeEmbedding(extractor, "./speaker1_b_cn_16k.wav")

    // Part 1: one embedding per speaker.
    var manager = SpeakerEmbeddingManager(extractor.dim())
    var ok = manager.add(name = "speaker1", embedding = embedding1a)
    check(ok)

    // The result must be captured here; otherwise the check below would only
    // re-test the outcome of the first add().
    ok = manager.add(name = "speaker2", embedding = embedding2a)
    check(ok)

    var name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "speaker1")

    manager.release()

    // Part 2: several embeddings registered under one name.
    manager = SpeakerEmbeddingManager(extractor.dim())
    val embeddingList = mutableListOf(embedding1a, embedding1b)
    ok = manager.add(name = "s1", embedding = embeddingList.toTypedArray())
    check(ok)

    name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "s1")

    // Speaker 2 was never added to this manager, so no name is expected.
    name = manager.search(embedding = embedding2a, threshold = 0.5f)
    check(name.isEmpty())

    manager.release()
    extractor.release()
    println("Speaker ID test done!")
}
/**
 * Computes the speaker embedding for the wave file at [filename].
 *
 * The samples and sample rate are read with [WaveReader.readWaveFromFile],
 * pushed into a fresh stream created by [extractor], and the input is marked
 * as finished before the embedding is computed.
 *
 * @param extractor the extractor used to create the stream and compute the embedding
 * @param filename path of the wave file to read
 * @return the embedding returned by the extractor for the stream
 * @throws IllegalStateException if the extractor reports that the stream is not ready
 */
fun computeEmbedding(extractor: SpeakerEmbeddingExtractor, filename: String): FloatArray {
    // readWaveFromFile returns the samples at index 0 and the sample rate at index 1.
    val objArray = WaveReader.readWaveFromFile(filename = filename)
    val samples = objArray[0] as FloatArray
    val sampleRate = objArray[1] as Int

    val stream = extractor.createStream()
    try {
        stream.acceptWaveform(sampleRate = sampleRate, samples = samples)
        stream.inputFinished()
        check(extractor.isReady(stream)) { "stream is not ready after inputFinished(): $filename" }
        return extractor.compute(stream)
    } finally {
        // Release the stream even when the readiness check or compute() throws,
        // so a failing file does not leak it.
        stream.release()
    }
}

View File

@@ -0,0 +1,30 @@
package com.k2fsa.sherpa.onnx
/** Entry point: runs the text-to-speech example. */
fun main() = testTts()
/**
 * Synthesizes one English sentence with a VITS piper model, saves it to
 * "test-en.wav" and releases the TTS engine. [callback] is passed to the
 * generator as its callback.
 */
fun testTts() {
    // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
    // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
    val vitsConfig = OfflineTtsVitsModelConfig(
        model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx",
        tokens = "./vits-piper-en_US-amy-low/tokens.txt",
        dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data",
    )
    val ttsConfig = OfflineTtsConfig(
        model = OfflineTtsModelConfig(
            vits = vitsConfig,
            numThreads = 1,
            debug = true,
        ),
    )
    val tts = OfflineTts(config = ttsConfig)

    val sentence = "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”"
    val audio = tts.generateWithCallback(text = sentence, callback = ::callback)
    audio.save(filename = "test-en.wav")

    tts.release()
    println("Saved to test-en.wav")
}
/**
 * Sample callback passed to the TTS generator in this file.
 *
 * Prints how many samples the received chunk contains.
 */
fun callback(samples: FloatArray) {
    println("callback got called with ${samples.size} samples")
}