Refactor the JNI interface to make it more modular and maintainable (#802)
This commit is contained in:
@@ -1 +1 @@
|
||||
../android/SherpaOnnxAudioTagging/app/src/main/java/com/k2fsa/sherpa/onnx/audio/tagging/AudioTagging.kt
|
||||
../sherpa-onnx/kotlin-api/AudioTagging.kt
|
||||
1
kotlin-api-examples/FeatureConfig.kt
Symbolic link
1
kotlin-api-examples/FeatureConfig.kt
Symbolic link
@@ -0,0 +1 @@
|
||||
../sherpa-onnx/kotlin-api/FeatureConfig.kt
|
||||
@@ -1,245 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/**
 * Callback passed to `generateWithCallback` by [testTts]; prints how many
 * samples the received array contains.
 *
 * @param samples the audio samples handed to the callback.
 */
fun callback(samples: FloatArray): Unit =
    println("callback got called with ${samples.size} samples")
|
||||
|
||||
/**
 * Entry point: runs every example in sequence — spoken-language identification,
 * audio tagging, speaker recognition, TTS, then streaming ASR for both model types.
 */
fun main() {
    testSpokenLanguageIdentifcation()
    testAudioTagging()
    testSpeakerRecognition()
    testTts()
    for (modelType in listOf("transducer", "zipformer2-ctc")) {
        testAsr(modelType)
    }
}
|
||||
|
||||
/**
 * Runs spoken-language identification over three sample recordings using the
 * Whisper tiny int8 encoder/decoder, printing each file name followed by the
 * value returned by `compute`.
 */
fun testSpokenLanguageIdentifcation() {
    val whisperConfig = SpokenLanguageIdentificationWhisperConfig(
        encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx",
        decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx",
        tailPaddings = 33,
    )
    val slid = SpokenLanguageIdentification(
        assetManager = null,
        config = SpokenLanguageIdentificationConfig(
            whisper = whisperConfig,
            numThreads = 1,
            debug = true,
            provider = "cpu",
        ),
    )

    listOf(
        "./spoken-language-identification-test-wavs/ar-arabic.wav",
        "./spoken-language-identification-test-wavs/bg-bulgarian.wav",
        "./spoken-language-identification-test-wavs/de-german.wav",
    ).forEach { wavePath ->
        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        // One stream per file; it is released as soon as the result is available.
        val stream = slid.createStream()
        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val lang = slid.compute(stream)
        stream.release()

        println(wavePath)
        println(lang)
    }
}
|
||||
|
||||
/**
 * Tags four bundled test recordings with the zipformer audio-tagging model
 * (topK = 5) and prints, for each file, its path and the events returned by
 * the tagger, separated by dashed lines.
 */
fun testAudioTagging() {
    val modelConfig = AudioTaggingModelConfig(
        zipformer = OfflineZipformerAudioTaggingModelConfig(
            model = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx",
        ),
        numThreads = 1,
        debug = true,
        provider = "cpu",
    )
    val tagger = AudioTagging(
        assetManager = null,
        config = AudioTaggingConfig(
            model = modelConfig,
            labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv",
            topK = 5,
        ),
    )

    println("----------")
    listOf(
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/2.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/3.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/4.wav",
    ).forEach { wavePath ->
        val stream = tagger.createStream()

        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val events = tagger.compute(stream)
        stream.release()

        println(wavePath)
        println(events)
        println("----------")
    }

    tagger.release()
}
|
||||
|
||||
/**
 * Reads a wave file and computes its speaker embedding.
 *
 * @param extractor the extractor used to create the stream and compute the embedding.
 * @param filename path of the wave file to read.
 * @return the embedding returned by [extractor] for the file's samples.
 * @throws IllegalStateException if the extractor does not report the stream as
 *   ready after all samples have been fed and the input marked finished.
 */
fun computeEmbedding(extractor: SpeakerEmbeddingExtractor, filename: String): FloatArray {
    val objArray = WaveReader.readWaveFromFile(filename = filename)
    val samples = objArray[0] as FloatArray
    val sampleRate = objArray[1] as Int

    val stream = extractor.createStream()
    try {
        stream.acceptWaveform(sampleRate = sampleRate, samples = samples)
        stream.inputFinished()
        check(extractor.isReady(stream)) { "Extractor is not ready after feeding $filename" }
        return extractor.compute(stream)
    } finally {
        // Release the stream even when the readiness check or compute() throws.
        stream.release()
    }
}
|
||||
|
||||
/**
 * Exercises speaker enrollment and lookup with the 3D-Speaker ERes2Net model.
 *
 * Scenario 1 registers two speakers with one embedding each and checks that a
 * second recording of speaker 1 is matched to "speaker1". Scenario 2 registers
 * a single speaker "s1" from two embeddings and checks that speaker 1 matches
 * while speaker 2 yields an empty name.
 *
 * @throws IllegalStateException if any enrollment or lookup expectation fails.
 */
fun testSpeakerRecognition() {
    val config = SpeakerEmbeddingExtractorConfig(
        model = "./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx",
    )
    val extractor = SpeakerEmbeddingExtractor(config = config)

    val embedding1a = computeEmbedding(extractor, "./speaker1_a_cn_16k.wav")
    val embedding2a = computeEmbedding(extractor, "./speaker2_a_cn_16k.wav")
    val embedding1b = computeEmbedding(extractor, "./speaker1_b_cn_16k.wav")

    // Scenario 1: one embedding per speaker.
    var manager = SpeakerEmbeddingManager(extractor.dim())
    var ok = manager.add(name = "speaker1", embedding = embedding1a)
    check(ok)

    // The result of this add must be captured; otherwise the check below would
    // only re-test the outcome of the first add.
    ok = manager.add(name = "speaker2", embedding = embedding2a)
    check(ok)

    var name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "speaker1")

    manager.release()

    // Scenario 2: several embeddings registered under a single name.
    manager = SpeakerEmbeddingManager(extractor.dim())
    ok = manager.add(name = "s1", embedding = arrayOf(embedding1a, embedding1b))
    check(ok)

    name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "s1")

    // Speaker 2 was never enrolled here, so the search is expected to return "".
    name = manager.search(embedding = embedding2a, threshold = 0.5f)
    check(name.isEmpty())

    manager.release()
}
|
||||
|
||||
/**
 * Synthesizes one English sentence with the VITS piper "amy-low" voice, passing
 * [callback] to `generateWithCallback`, and saves the audio to `test-en.wav`.
 *
 * Models: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
 * (https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
 */
fun testTts() {
    // `val`: the configuration is built once and never reassigned.
    val config = OfflineTtsConfig(
        model = OfflineTtsModelConfig(
            vits = OfflineTtsVitsModelConfig(
                model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx",
                tokens = "./vits-piper-en_US-amy-low/tokens.txt",
                dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data",
            ),
            numThreads = 1,
            debug = true,
        ),
    )
    val tts = OfflineTts(config = config)
    val audio = tts.generateWithCallback(
        text = "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”",
        callback = ::callback,
    )
    audio.save(filename = "test-en.wav")
}
|
||||
|
||||
/**
 * Decodes one bundled test recording with a streaming model and prints the
 * recognized text.
 *
 * @param type model family to load: "transducer" or "zipformer2-ctc".
 * @throws IllegalArgumentException if [type] is any other value.
 */
fun testAsr(type: String) {
    val featConfig = FeatureConfig(
        sampleRate = 16000,
        featureDim = 80,
    )

    // Assigned exactly once, inside whichever branch of the `when` is taken.
    val waveFilename: String
    val modelConfig: OnlineModelConfig = when (type) {
        "transducer" -> {
            waveFilename = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav"
            // Please refer to
            // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
            // to download pre-trained models.
            OnlineModelConfig(
                transducer = OnlineTransducerModelConfig(
                    encoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx",
                    decoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx",
                    joiner = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt",
                numThreads = 1,
                debug = false,
            )
        }
        "zipformer2-ctc" -> {
            waveFilename = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav"
            OnlineModelConfig(
                zipformer2Ctc = OnlineZipformer2CtcModelConfig(
                    model = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt",
                numThreads = 1,
                debug = false,
            )
        }
        else -> throw IllegalArgumentException(type)
    }

    val endpointConfig = EndpointConfig()
    val lmConfig = OnlineLMConfig()

    val config = OnlineRecognizerConfig(
        modelConfig = modelConfig,
        lmConfig = lmConfig,
        featConfig = featConfig,
        endpointConfig = endpointConfig,
        enableEndpoint = true,
        decodingMethod = "greedy_search",
        maxActivePaths = 4,
    )

    val model = SherpaOnnx(
        config = config,
    )

    val objArray = WaveReader.readWaveFromFile(
        filename = waveFilename,
    )
    val samples = objArray[0] as FloatArray
    val sampleRate = objArray[1] as Int

    model.acceptWaveform(samples, sampleRate = sampleRate)
    while (model.isReady()) {
        model.decode()
    }

    // Append 0.5 seconds of zero samples, then mark the input finished and
    // decode whatever the model still reports as ready.
    val tailPaddings = FloatArray((sampleRate * 0.5).toInt())
    model.acceptWaveform(tailPaddings, sampleRate = sampleRate)
    model.inputFinished()
    while (model.isReady()) {
        model.decode()
    }

    println("results: ${model.text}")
}
|
||||
1
kotlin-api-examples/OfflineRecognizer.kt
Symbolic link
1
kotlin-api-examples/OfflineRecognizer.kt
Symbolic link
@@ -0,0 +1 @@
|
||||
../sherpa-onnx/kotlin-api/OfflineRecognizer.kt
|
||||
@@ -1 +1 @@
|
||||
../android/SherpaOnnxAudioTagging/app/src/main/java/com/k2fsa/sherpa/onnx/audio/tagging/OfflineStream.kt
|
||||
../sherpa-onnx/kotlin-api/OfflineStream.kt
|
||||
1
kotlin-api-examples/OnlineRecognizer.kt
Symbolic link
1
kotlin-api-examples/OnlineRecognizer.kt
Symbolic link
@@ -0,0 +1 @@
|
||||
../sherpa-onnx/kotlin-api/OnlineRecognizer.kt
|
||||
1
kotlin-api-examples/OnlineStream.kt
Symbolic link
1
kotlin-api-examples/OnlineStream.kt
Symbolic link
@@ -0,0 +1 @@
|
||||
../sherpa-onnx/kotlin-api/OnlineStream.kt
|
||||
@@ -1 +0,0 @@
|
||||
../android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt
|
||||
@@ -1 +0,0 @@
|
||||
../android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt
|
||||
@@ -1 +1 @@
|
||||
../android/SherpaOnnxSpeakerIdentification/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/identification/Speaker.kt
|
||||
../sherpa-onnx/kotlin-api/Speaker.kt
|
||||
@@ -1 +1 @@
|
||||
../android/SherpaOnnxSpokenLanguageIdentification/app/src/main/java/com/k2fsa/sherpa/onnx/slid/SpokenLanguageIdentification.kt
|
||||
../sherpa-onnx/kotlin-api/SpokenLanguageIdentification.kt
|
||||
@@ -1 +1 @@
|
||||
../android/SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt
|
||||
../sherpa-onnx/kotlin-api/Vad.kt
|
||||
@@ -1 +1 @@
|
||||
../android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/WaveReader.kt
|
||||
../sherpa-onnx/kotlin-api/WaveReader.kt
|
||||
@@ -44,9 +44,23 @@ function testSpeakerEmbeddingExtractor() {
|
||||
if [ ! -f ./speaker2_a_cn_16k.wav ]; then
|
||||
curl -SL -O https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav
|
||||
fi
|
||||
|
||||
out_filename=test_speaker_id.jar
|
||||
kotlinc-jvm -include-runtime -d $out_filename \
|
||||
test_speaker_id.kt \
|
||||
OnlineStream.kt \
|
||||
Speaker.kt \
|
||||
WaveReader.kt \
|
||||
faked-asset-manager.kt \
|
||||
faked-log.kt
|
||||
|
||||
ls -lh $out_filename
|
||||
|
||||
java -Djava.library.path=../build/lib -jar $out_filename
|
||||
}
|
||||
|
||||
function testAsr() {
|
||||
|
||||
function testOnlineAsr() {
|
||||
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
|
||||
git lfs install
|
||||
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
|
||||
@@ -57,6 +71,20 @@ function testAsr() {
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
|
||||
fi
|
||||
|
||||
out_filename=test_online_asr.jar
|
||||
kotlinc-jvm -include-runtime -d $out_filename \
|
||||
test_online_asr.kt \
|
||||
FeatureConfig.kt \
|
||||
OnlineRecognizer.kt \
|
||||
OnlineStream.kt \
|
||||
WaveReader.kt \
|
||||
faked-asset-manager.kt \
|
||||
faked-log.kt
|
||||
|
||||
ls -lh $out_filename
|
||||
|
||||
java -Djava.library.path=../build/lib -jar $out_filename
|
||||
}
|
||||
|
||||
function testTts() {
|
||||
@@ -65,16 +93,42 @@ function testTts() {
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
rm vits-piper-en_US-amy-low.tar.bz2
|
||||
fi
|
||||
|
||||
out_filename=test_tts.jar
|
||||
kotlinc-jvm -include-runtime -d $out_filename \
|
||||
test_tts.kt \
|
||||
Tts.kt \
|
||||
faked-asset-manager.kt \
|
||||
faked-log.kt
|
||||
|
||||
ls -lh $out_filename
|
||||
|
||||
java -Djava.library.path=../build/lib -jar $out_filename
|
||||
}
|
||||
|
||||
|
||||
# Downloads the zipformer audio-tagging model when its directory is missing,
# then compiles the Kotlin audio-tagging example into a runnable jar and runs it.
function testAudioTagging() {
  if [ ! -d sherpa-onnx-zipformer-audio-tagging-2024-04-09 ]; then
    curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
    tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
    rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
  fi

  out_filename=test_audio_tagging.jar
  # Quote the expansion so the jar name is always passed as a single word.
  kotlinc-jvm -include-runtime -d "$out_filename" \
    test_audio_tagging.kt \
    AudioTagging.kt \
    OfflineStream.kt \
    WaveReader.kt \
    faked-asset-manager.kt \
    faked-log.kt

  ls -lh "$out_filename"

  java -Djava.library.path=../build/lib -jar "$out_filename"
}
|
||||
|
||||
|
||||
function testSpokenLanguageIdentification() {
|
||||
if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
|
||||
@@ -87,50 +141,44 @@ function testSpokenLanguageIdentification() {
|
||||
tar xvf spoken-language-identification-test-wavs.tar.bz2
|
||||
rm spoken-language-identification-test-wavs.tar.bz2
|
||||
fi
|
||||
|
||||
out_filename=test_language_id.jar
|
||||
kotlinc-jvm -include-runtime -d $out_filename \
|
||||
test_language_id.kt \
|
||||
SpokenLanguageIdentification.kt \
|
||||
OfflineStream.kt \
|
||||
WaveReader.kt \
|
||||
faked-asset-manager.kt \
|
||||
faked-log.kt
|
||||
|
||||
ls -lh $out_filename
|
||||
|
||||
java -Djava.library.path=../build/lib -jar $out_filename
|
||||
}
|
||||
|
||||
# Runs each example test function, one after another, in a fixed order.
function test() {
  local step
  for step in \
    testSpokenLanguageIdentification \
    testAudioTagging \
    testSpeakerEmbeddingExtractor \
    testAsr \
    testTts
  do
    "$step"
  done
}
|
||||
|
||||
test
|
||||
|
||||
kotlinc-jvm -include-runtime -d main.jar \
|
||||
AudioTagging.kt \
|
||||
Main.kt \
|
||||
OfflineStream.kt \
|
||||
SherpaOnnx.kt \
|
||||
Speaker.kt \
|
||||
SpokenLanguageIdentification.kt \
|
||||
Tts.kt \
|
||||
WaveReader.kt \
|
||||
faked-asset-manager.kt \
|
||||
faked-log.kt
|
||||
|
||||
ls -lh main.jar
|
||||
|
||||
java -Djava.library.path=../build/lib -jar main.jar
|
||||
|
||||
function testTwoPass() {
|
||||
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
|
||||
fi
|
||||
|
||||
function testOfflineAsr() {
|
||||
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
rm sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
fi
|
||||
|
||||
kotlinc-jvm -include-runtime -d 2pass.jar test-2pass.kt WaveReader.kt SherpaOnnx2Pass.kt faked-asset-manager.kt
|
||||
ls -lh 2pass.jar
|
||||
java -Djava.library.path=../build/lib -jar 2pass.jar
|
||||
out_filename=test_offline_asr.jar
|
||||
kotlinc-jvm -include-runtime -d $out_filename \
|
||||
test_offline_asr.kt \
|
||||
FeatureConfig.kt \
|
||||
OfflineRecognizer.kt \
|
||||
OfflineStream.kt \
|
||||
WaveReader.kt \
|
||||
faked-asset-manager.kt
|
||||
|
||||
ls -lh $out_filename
|
||||
java -Djava.library.path=../build/lib -jar $out_filename
|
||||
}
|
||||
|
||||
testTwoPass
|
||||
testSpeakerEmbeddingExtractor
|
||||
testOnlineAsr
|
||||
testTts
|
||||
testAudioTagging
|
||||
testSpokenLanguageIdentification
|
||||
testOfflineAsr
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/** Entry point: runs the two-pass recognition example. */
fun main() = test2Pass()
|
||||
|
||||
/**
 * Decodes one test recording twice: first with the streaming recognizer from
 * [createFirstPass], then with the offline recognizer from [createSecondPass],
 * printing the text produced by each pass.
 */
fun test2Pass() {
    val firstPass = createFirstPass()
    val secondPass = createSecondPass()

    val waveFilename = "./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav"

    val objArray = WaveReader.readWaveFromFile(
        filename = waveFilename,
    )
    val samples = objArray[0] as FloatArray
    val sampleRate = objArray[1] as Int

    firstPass.acceptWaveform(samples, sampleRate = sampleRate)
    while (firstPass.isReady()) {
        firstPass.decode()
    }

    // Separate immutable values instead of one reassigned `var`.
    val firstPassText = firstPass.text
    println("First pass text: $firstPassText")

    // The second pass receives the same samples in a single call.
    val secondPassText = secondPass.decode(samples, sampleRate)
    println("Second pass text: $secondPassText")
}
|
||||
|
||||
/**
 * Builds the streaming (first-pass) recognizer with 16 kHz / 80-dim features,
 * model configuration type 1, and endpointing enabled.
 *
 * @throws IllegalStateException if `getModelConfig(type = 1)` returns null.
 */
fun createFirstPass(): SherpaOnnx {
    // Fail with an explicit message rather than a bare `!!` NullPointerException.
    val modelConfig = checkNotNull(getModelConfig(type = 1)) {
        "getModelConfig(type = 1) returned null"
    }
    val config = OnlineRecognizerConfig(
        featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80),
        modelConfig = modelConfig,
        endpointConfig = getEndpointConfig(),
        enableEndpoint = true,
    )

    return SherpaOnnx(config = config)
}
|
||||
|
||||
/**
 * Builds the offline (second-pass) recognizer with 16 kHz / 80-dim features
 * and offline model configuration type 2.
 *
 * @throws IllegalStateException if `getOfflineModelConfig(type = 2)` returns null.
 */
fun createSecondPass(): SherpaOnnxOffline {
    // Fail with an explicit message rather than a bare `!!` NullPointerException.
    val modelConfig = checkNotNull(getOfflineModelConfig(type = 2)) {
        "getOfflineModelConfig(type = 2) returned null"
    }
    val config = OfflineRecognizerConfig(
        featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80),
        modelConfig = modelConfig,
    )

    return SherpaOnnxOffline(config = config)
}
|
||||
49
kotlin-api-examples/test_audio_tagging.kt
Normal file
49
kotlin-api-examples/test_audio_tagging.kt
Normal file
@@ -0,0 +1,49 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/** Entry point: runs the audio-tagging example. */
fun main() = testAudioTagging()
|
||||
|
||||
/**
 * Tags four bundled test recordings with the zipformer audio-tagging model
 * (topK = 5) and prints, for each file, its path and the events returned by
 * the tagger, separated by dashed lines. The tagger is released at the end.
 */
fun testAudioTagging() {
    val modelConfig = AudioTaggingModelConfig(
        zipformer = OfflineZipformerAudioTaggingModelConfig(
            model = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx",
        ),
        numThreads = 1,
        debug = true,
        provider = "cpu",
    )
    val tagger = AudioTagging(
        config = AudioTaggingConfig(
            model = modelConfig,
            labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv",
            topK = 5,
        ),
    )

    println("----------")
    listOf(
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/2.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/3.wav",
        "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/4.wav",
    ).forEach { wavePath ->
        val stream = tagger.createStream()

        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val events = tagger.compute(stream)
        stream.release()

        println(wavePath)
        println(events)
        println("----------")
    }

    tagger.release()
}
|
||||
|
||||
43
kotlin-api-examples/test_language_id.kt
Normal file
43
kotlin-api-examples/test_language_id.kt
Normal file
@@ -0,0 +1,43 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/** Entry point: runs the spoken-language identification example. */
fun main() = testSpokenLanguageIdentifcation()
|
||||
|
||||
/**
 * Runs spoken-language identification over three sample recordings using the
 * Whisper tiny int8 encoder/decoder, printing each file name followed by the
 * value returned by `compute`. The identifier is released at the end.
 */
fun testSpokenLanguageIdentifcation() {
    val whisperConfig = SpokenLanguageIdentificationWhisperConfig(
        encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx",
        decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx",
        tailPaddings = 33,
    )
    val slid = SpokenLanguageIdentification(
        config = SpokenLanguageIdentificationConfig(
            whisper = whisperConfig,
            numThreads = 1,
            debug = true,
            provider = "cpu",
        ),
    )

    listOf(
        "./spoken-language-identification-test-wavs/ar-arabic.wav",
        "./spoken-language-identification-test-wavs/bg-bulgarian.wav",
        "./spoken-language-identification-test-wavs/de-german.wav",
    ).forEach { wavePath ->
        val wave = WaveReader.readWaveFromFile(filename = wavePath)
        val samples = wave[0] as FloatArray
        val sampleRate = wave[1] as Int

        // One stream per file; it is released as soon as the result is available.
        val stream = slid.createStream()
        stream.acceptWaveform(samples, sampleRate = sampleRate)
        val lang = slid.compute(stream)
        stream.release()

        println(wavePath)
        println(lang)
    }

    slid.release()
}
|
||||
|
||||
32
kotlin-api-examples/test_offline_asr.kt
Normal file
32
kotlin-api-examples/test_offline_asr.kt
Normal file
@@ -0,0 +1,32 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/**
 * Entry point: decodes one test recording with the offline recognizer built by
 * [createOfflineRecognizer], prints the result, and releases the stream and
 * the recognizer.
 */
fun main() {
    val offlineRecognizer = createOfflineRecognizer()

    val wave = WaveReader.readWaveFromFile(
        filename = "./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav",
    )
    val audio = wave[0] as FloatArray
    val rate = wave[1] as Int

    // Feed the whole recording at once, decode, then fetch the result.
    val offlineStream = offlineRecognizer.createStream()
    offlineStream.acceptWaveform(audio, sampleRate = rate)
    offlineRecognizer.decode(offlineStream)
    println(offlineRecognizer.getResult(offlineStream))

    offlineStream.release()
    offlineRecognizer.release()
}
|
||||
|
||||
/**
 * Builds an [OfflineRecognizer] with 16 kHz / 80-dim features and offline
 * model configuration type 2.
 *
 * @throws IllegalStateException if `getOfflineModelConfig(type = 2)` returns null.
 */
fun createOfflineRecognizer(): OfflineRecognizer {
    // Fail with an explicit message rather than a bare `!!` NullPointerException.
    val modelConfig = checkNotNull(getOfflineModelConfig(type = 2)) {
        "getOfflineModelConfig(type = 2) returned null"
    }
    val config = OfflineRecognizerConfig(
        featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80),
        modelConfig = modelConfig,
    )

    return OfflineRecognizer(config = config)
}
|
||||
87
kotlin-api-examples/test_online_asr.kt
Normal file
87
kotlin-api-examples/test_online_asr.kt
Normal file
@@ -0,0 +1,87 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/** Entry point: runs the streaming ASR example for both supported model types. */
fun main() {
    for (modelType in listOf("transducer", "zipformer2-ctc")) {
        testOnlineAsr(modelType)
    }
}
|
||||
|
||||
/**
 * Decodes one bundled test recording with a streaming recognizer and prints
 * the recognized text, then releases the stream and the recognizer.
 *
 * @param type model family to load: "transducer" or "zipformer2-ctc".
 * @throws IllegalArgumentException if [type] is any other value.
 */
fun testOnlineAsr(type: String) {
    val featConfig = FeatureConfig(sampleRate = 16000, featureDim = 80)

    // Each branch yields the test recording together with its model configuration.
    // Pre-trained models can be downloaded from
    // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    val (waveFilename, modelConfig) = when (type) {
        "transducer" ->
            "./sherpa-onnx-streaming-zipformer-en-2023-02-21/test_wavs/0.wav" to OnlineModelConfig(
                transducer = OnlineTransducerModelConfig(
                    encoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/encoder-epoch-99-avg-1.onnx",
                    decoder = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/decoder-epoch-99-avg-1.onnx",
                    joiner = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/joiner-epoch-99-avg-1.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt",
                numThreads = 1,
                debug = false,
            )

        "zipformer2-ctc" ->
            "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav" to OnlineModelConfig(
                zipformer2Ctc = OnlineZipformer2CtcModelConfig(
                    model = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx",
                ),
                tokens = "./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt",
                numThreads = 1,
                debug = false,
            )

        else -> throw IllegalArgumentException(type)
    }

    val endpointConfig = EndpointConfig()
    val lmConfig = OnlineLMConfig()
    val recognizerConfig = OnlineRecognizerConfig(
        modelConfig = modelConfig,
        lmConfig = lmConfig,
        featConfig = featConfig,
        endpointConfig = endpointConfig,
        enableEndpoint = true,
        decodingMethod = "greedy_search",
        maxActivePaths = 4,
    )
    val recognizer = OnlineRecognizer(config = recognizerConfig)

    val wave = WaveReader.readWaveFromFile(filename = waveFilename)
    val samples = wave[0] as FloatArray
    val sampleRate = wave[1] as Int

    val stream = recognizer.createStream()

    // Keep decoding while the recognizer reports the stream as ready.
    fun decodeWhileReady() {
        while (recognizer.isReady(stream)) {
            recognizer.decode(stream)
        }
    }

    stream.acceptWaveform(samples, sampleRate = sampleRate)
    decodeWhileReady()

    // Append 0.5 seconds of zero samples, mark the input finished, and decode the rest.
    val tailPaddings = FloatArray((sampleRate * 0.5).toInt())
    stream.acceptWaveform(tailPaddings, sampleRate = sampleRate)
    stream.inputFinished()
    decodeWhileReady()

    println("results: ${recognizer.getResult(stream).text}")

    stream.release()
    recognizer.release()
}
|
||||
62
kotlin-api-examples/test_speaker_id.kt
Normal file
62
kotlin-api-examples/test_speaker_id.kt
Normal file
@@ -0,0 +1,62 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/** Entry point: runs the speaker-recognition example. */
fun main() = testSpeakerRecognition()
|
||||
|
||||
/**
 * Exercises speaker enrollment and lookup with the 3D-Speaker ERes2Net model.
 *
 * Scenario 1 registers two speakers with one embedding each and checks that a
 * second recording of speaker 1 is matched to "speaker1". Scenario 2 registers
 * a single speaker "s1" from two embeddings and checks that speaker 1 matches
 * while speaker 2 yields an empty name. The managers and the extractor are
 * released before the completion message is printed.
 *
 * @throws IllegalStateException if any enrollment or lookup expectation fails.
 */
fun testSpeakerRecognition() {
    val config = SpeakerEmbeddingExtractorConfig(
        model = "./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx",
    )
    val extractor = SpeakerEmbeddingExtractor(config = config)

    val embedding1a = computeEmbedding(extractor, "./speaker1_a_cn_16k.wav")
    val embedding2a = computeEmbedding(extractor, "./speaker2_a_cn_16k.wav")
    val embedding1b = computeEmbedding(extractor, "./speaker1_b_cn_16k.wav")

    // Scenario 1: one embedding per speaker.
    var manager = SpeakerEmbeddingManager(extractor.dim())
    var ok = manager.add(name = "speaker1", embedding = embedding1a)
    check(ok)

    // The result of this add must be captured; otherwise the check below would
    // only re-test the outcome of the first add.
    ok = manager.add(name = "speaker2", embedding = embedding2a)
    check(ok)

    var name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "speaker1")

    manager.release()

    // Scenario 2: several embeddings registered under a single name.
    manager = SpeakerEmbeddingManager(extractor.dim())
    ok = manager.add(name = "s1", embedding = arrayOf(embedding1a, embedding1b))
    check(ok)

    name = manager.search(embedding = embedding1b, threshold = 0.5f)
    check(name == "s1")

    // Speaker 2 was never enrolled here, so the search is expected to return "".
    name = manager.search(embedding = embedding2a, threshold = 0.5f)
    check(name.isEmpty())

    manager.release()
    extractor.release()
    println("Speaker ID test done!")
}
|
||||
|
||||
/**
 * Reads a wave file and computes its speaker embedding.
 *
 * @param extractor the extractor used to create the stream and compute the embedding.
 * @param filename path of the wave file to read.
 * @return the embedding returned by [extractor] for the file's samples.
 * @throws IllegalStateException if the extractor does not report the stream as
 *   ready after all samples have been fed and the input marked finished.
 */
fun computeEmbedding(extractor: SpeakerEmbeddingExtractor, filename: String): FloatArray {
    val objArray = WaveReader.readWaveFromFile(filename = filename)
    val samples = objArray[0] as FloatArray
    val sampleRate = objArray[1] as Int

    val stream = extractor.createStream()
    try {
        stream.acceptWaveform(sampleRate = sampleRate, samples = samples)
        stream.inputFinished()
        check(extractor.isReady(stream)) { "Extractor is not ready after feeding $filename" }
        return extractor.compute(stream)
    } finally {
        // Release the stream even when the readiness check or compute() throws.
        stream.release()
    }
}
|
||||
30
kotlin-api-examples/test_tts.kt
Normal file
30
kotlin-api-examples/test_tts.kt
Normal file
@@ -0,0 +1,30 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/** Entry point: runs the text-to-speech example. */
fun main() = testTts()
|
||||
|
||||
/**
 * Synthesizes one English sentence with the VITS piper "amy-low" voice, passing
 * [callback] to `generateWithCallback`, saves the audio to `test-en.wav`,
 * releases the engine, and prints a confirmation.
 *
 * Models: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
 * (https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2)
 */
fun testTts() {
    // `val`: the configuration is built once and never reassigned.
    val config = OfflineTtsConfig(
        model = OfflineTtsModelConfig(
            vits = OfflineTtsVitsModelConfig(
                model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx",
                tokens = "./vits-piper-en_US-amy-low/tokens.txt",
                dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data",
            ),
            numThreads = 1,
            debug = true,
        ),
    )
    val tts = OfflineTts(config = config)
    val audio = tts.generateWithCallback(
        text = "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”",
        callback = ::callback,
    )
    audio.save(filename = "test-en.wav")
    tts.release()
    println("Saved to test-en.wav")
}
|
||||
|
||||
/**
 * Callback passed to `generateWithCallback` by [testTts]; prints how many
 * samples the received array contains.
 *
 * @param samples the audio samples handed to the callback.
 */
fun callback(samples: FloatArray): Unit =
    println("callback got called with ${samples.size} samples")
|
||||
Reference in New Issue
Block a user