Add Kotlin API for audio tagging (#770)

This commit is contained in:
Fangjun Kuang
2024-04-15 13:49:35 +08:00
committed by GitHub
parent 13730ecbd8
commit 5981adf454
17 changed files with 601 additions and 56 deletions

View File

@@ -0,0 +1,95 @@
package com.k2fsa.sherpa.onnx
import android.content.res.AssetManager
import android.util.Log
private val TAG = "sherpa-onnx"
data class OfflineZipformerAudioTaggingModelConfig (
val model: String,
)
data class AudioTaggingModelConfig (
var zipformer: OfflineZipformerAudioTaggingModelConfig,
var numThreads: Int = 1,
var debug: Boolean = false,
var provider: String = "cpu",
)
data class AudioTaggingConfig (
var model: AudioTaggingModelConfig,
var labels: String,
var topK: Int = 5,
)
data class AudioEvent (
val name: String,
val index: Int,
val prob: Float,
)
class AudioTagging(
assetManager: AssetManager? = null,
config: AudioTaggingConfig,
) {
private var ptr: Long
init {
ptr = if (assetManager != null) {
newFromAsset(assetManager, config)
} else {
newFromFile(config)
}
}
protected fun finalize() {
if(ptr != 0L) {
delete(ptr)
ptr = 0
}
}
fun release() = finalize()
fun createStream(): OfflineStream {
val p = createStream(ptr)
return OfflineStream(p)
}
fun compute(stream: OfflineStream, topK: Int=-1): ArrayList<AudioEvent> {
var events :Array<Any> = compute(ptr, stream.ptr, topK)
val ans = ArrayList<AudioEvent>()
for (e in events) {
val p :Array<Any> = e as Array<Any>
ans.add(AudioEvent(
name=p[0] as String,
index=p[1] as Int,
prob=p[2] as Float,
))
}
return ans
}
private external fun newFromAsset(
assetManager: AssetManager,
config: AudioTaggingConfig,
): Long
private external fun newFromFile(
config: AudioTaggingConfig,
): Long
private external fun delete(ptr: Long)
private external fun createStream(ptr: Long): Long
private external fun compute(ptr: Long, streamPtr: Long, topK: Int): Array<Any>
companion object {
init {
System.loadLibrary("sherpa-onnx-jni")
}
}
}

View File

@@ -7,12 +7,56 @@ fun callback(samples: FloatArray): Unit {
}
fun main() {
testAudioTagging()
testSpeakerRecognition()
testTts()
testAsr("transducer")
testAsr("zipformer2-ctc")
}
fun testAudioTagging() {
val config = AudioTaggingConfig(
model=AudioTaggingModelConfig(
zipformer=OfflineZipformerAudioTaggingModelConfig(
model="./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx",
),
numThreads=1,
debug=true,
provider="cpu",
),
labels="./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv",
topK=5,
)
val tagger = AudioTagging(assetManager=null, config=config)
val testFiles = arrayOf(
"./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav",
"./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/2.wav",
"./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/3.wav",
"./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/4.wav",
)
println("----------")
for (waveFilename in testFiles) {
val stream = tagger.createStream()
val objArray = WaveReader.readWaveFromFile(
filename = waveFilename,
)
val samples: FloatArray = objArray[0] as FloatArray
val sampleRate: Int = objArray[1] as Int
stream.acceptWaveform(samples, sampleRate = sampleRate)
val events = tagger.compute(stream)
stream.release()
println(waveFilename)
println(events)
println("----------")
}
tagger.release()
}
fun computeEmbedding(extractor: SpeakerEmbeddingExtractor, filename: String): FloatArray {
var objArray = WaveReader.readWaveFromFile(
filename = filename,

View File

@@ -0,0 +1,24 @@
package com.k2fsa.sherpa.onnx
class OfflineStream(var ptr: Long) {
fun acceptWaveform(samples: FloatArray, sampleRate: Int) =
acceptWaveform(ptr, samples, sampleRate)
protected fun finalize() {
if(ptr != 0L) {
delete(ptr)
ptr = 0
}
}
fun release() = finalize()
private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)
private external fun delete(ptr: Long)
companion object {
init {
System.loadLibrary("sherpa-onnx-jni")
}
}
}

View File

@@ -4,8 +4,7 @@
# Note: This scripts runs only on Linux and macOS, though sherpa-onnx
# supports building JNI libs for Windows.
set -e
set -ex
cd ..
mkdir -p build
@@ -29,59 +28,93 @@ export LD_LIBRARY_PATH=$PWD/build/lib:$LD_LIBRARY_PATH
cd ../kotlin-api-examples
if [ ! -f ./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx ]; then
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx
fi
function testSpeakerEmbeddingExtractor() {
if [ ! -f ./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx ]; then
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx
fi
if [ ! -f ./speaker1_a_cn_16k.wav ]; then
wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_cn_16k.wav
fi
if [ ! -f ./speaker1_a_cn_16k.wav ]; then
wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_a_cn_16k.wav
fi
if [ ! -f ./speaker1_b_cn_16k.wav ]; then
wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_cn_16k.wav
fi
if [ ! -f ./speaker1_b_cn_16k.wav ]; then
wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker1_b_cn_16k.wav
fi
if [ ! -f ./speaker2_a_cn_16k.wav ]; then
wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav
fi
if [ ! -f ./speaker2_a_cn_16k.wav ]; then
wget -q https://github.com/csukuangfj/sr-data/raw/main/test/3d-speaker/speaker2_a_cn_16k.wav
fi
}
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
git lfs install
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
fi
function testAsr() {
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then
git lfs install
git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21
fi
if [ ! -d ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 ]; then
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
fi
if [ ! -d ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 ]; then
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
fi
}
if [ ! -f ./vits-piper-en_US-amy-low/en_US-amy-low.onnx ]; then
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
rm vits-piper-en_US-amy-low.tar.bz2
fi
function testTts() {
if [ ! -f ./vits-piper-en_US-amy-low/en_US-amy-low.onnx ]; then
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
rm vits-piper-en_US-amy-low.tar.bz2
fi
}
kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt Speaker.kt faked-log.kt
function testAudioTagging() {
if [ ! -d sherpa-onnx-zipformer-audio-tagging-2024-04-09 ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
fi
}
function test() {
testAudioTagging
testSpeakerEmbeddingExtractor
testAsr
testTts
}
test
kotlinc-jvm -include-runtime -d main.jar \
AudioTagging.kt \
Main.kt \
OfflineStream.kt \
SherpaOnnx.kt \
Speaker.kt \
Tts.kt \
WaveReader.kt \
faked-asset-manager.kt \
faked-log.kt
ls -lh main.jar
java -Djava.library.path=../build/lib -jar main.jar
# For two-pass
function testTwoPass() {
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx ]; then
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
fi
if [ ! -f ./sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.int8.onnx ]; then
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
rm sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
fi
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi
if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx ]; then
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi
kotlinc-jvm -include-runtime -d 2pass.jar test-2pass.kt WaveReader.kt SherpaOnnx2Pass.kt faked-asset-manager.kt
ls -lh 2pass.jar
java -Djava.library.path=../build/lib -jar 2pass.jar
}
kotlinc-jvm -include-runtime -d 2pass.jar test-2pass.kt WaveReader.kt SherpaOnnx2Pass.kt faked-asset-manager.kt
ls -lh 2pass.jar
java -Djava.library.path=../build/lib -jar 2pass.jar
testTwoPass