Refactor the JNI interface to make it more modular and maintainable (#802)
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:label="ASR: Next-gen Kaldi"
|
||||
android:exported="true">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/FeatureConfig.kt
|
||||
@@ -12,16 +12,19 @@ import android.widget.Button
|
||||
import android.widget.TextView
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.*
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
private const val TAG = "sherpa-onnx"
|
||||
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
|
||||
// To enable microphone in android emulator, use
|
||||
//
|
||||
// adb emu avd hostmicon
|
||||
|
||||
class MainActivity : AppCompatActivity() {
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
private lateinit var model: SherpaOnnx
|
||||
private lateinit var recognizer: OnlineRecognizer
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private lateinit var recordButton: Button
|
||||
private lateinit var textView: TextView
|
||||
@@ -87,7 +90,6 @@ class MainActivity : AppCompatActivity() {
|
||||
audioRecord!!.startRecording()
|
||||
recordButton.setText(R.string.stop)
|
||||
isRecording = true
|
||||
model.reset(true)
|
||||
textView.text = ""
|
||||
lastText = ""
|
||||
idx = 0
|
||||
@@ -108,6 +110,7 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
private fun processSamples() {
|
||||
Log.i(TAG, "processing samples")
|
||||
val stream = recognizer.createStream()
|
||||
|
||||
val interval = 0.1 // i.e., 100 ms
|
||||
val bufferSize = (interval * sampleRateInHz).toInt() // in samples
|
||||
@@ -117,29 +120,41 @@ class MainActivity : AppCompatActivity() {
|
||||
val ret = audioRecord?.read(buffer, 0, buffer.size)
|
||||
if (ret != null && ret > 0) {
|
||||
val samples = FloatArray(ret) { buffer[it] / 32768.0f }
|
||||
model.acceptWaveform(samples, sampleRate=sampleRateInHz)
|
||||
while (model.isReady()) {
|
||||
model.decode()
|
||||
stream.acceptWaveform(samples, sampleRate = sampleRateInHz)
|
||||
while (recognizer.isReady(stream)) {
|
||||
recognizer.decode(stream)
|
||||
}
|
||||
|
||||
val isEndpoint = model.isEndpoint()
|
||||
val text = model.text
|
||||
val isEndpoint = recognizer.isEndpoint(stream)
|
||||
var text = recognizer.getResult(stream).text
|
||||
|
||||
var textToDisplay = lastText;
|
||||
// For streaming paraformer, we need to manually add some
|
||||
// paddings so that it has enough right context to
|
||||
// recognize the last word of this segment
|
||||
if (isEndpoint && recognizer.config.modelConfig.paraformer.encoder.isNotBlank()) {
|
||||
val tailPaddings = FloatArray((0.8 * sampleRateInHz).toInt())
|
||||
stream.acceptWaveform(tailPaddings, sampleRate = sampleRateInHz)
|
||||
while (recognizer.isReady(stream)) {
|
||||
recognizer.decode(stream)
|
||||
}
|
||||
text = recognizer.getResult(stream).text
|
||||
}
|
||||
|
||||
if(text.isNotBlank()) {
|
||||
if (lastText.isBlank()) {
|
||||
textToDisplay = "${idx}: ${text}"
|
||||
var textToDisplay = lastText
|
||||
|
||||
if (text.isNotBlank()) {
|
||||
textToDisplay = if (lastText.isBlank()) {
|
||||
"${idx}: $text"
|
||||
} else {
|
||||
textToDisplay = "${lastText}\n${idx}: ${text}"
|
||||
"${lastText}\n${idx}: $text"
|
||||
}
|
||||
}
|
||||
|
||||
if (isEndpoint) {
|
||||
model.reset()
|
||||
recognizer.reset(stream)
|
||||
if (text.isNotBlank()) {
|
||||
lastText = "${lastText}\n${idx}: ${text}"
|
||||
textToDisplay = lastText;
|
||||
lastText = "${lastText}\n${idx}: $text"
|
||||
textToDisplay = lastText
|
||||
idx += 1
|
||||
}
|
||||
}
|
||||
@@ -149,6 +164,7 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
}
|
||||
stream.release()
|
||||
}
|
||||
|
||||
private fun initMicrophone(): Boolean {
|
||||
@@ -180,7 +196,7 @@ class MainActivity : AppCompatActivity() {
|
||||
// See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// for a list of available models
|
||||
val type = 0
|
||||
println("Select model type ${type}")
|
||||
Log.i(TAG, "Select model type $type")
|
||||
val config = OnlineRecognizerConfig(
|
||||
featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
|
||||
modelConfig = getModelConfig(type = type)!!,
|
||||
@@ -189,7 +205,7 @@ class MainActivity : AppCompatActivity() {
|
||||
enableEndpoint = true,
|
||||
)
|
||||
|
||||
model = SherpaOnnx(
|
||||
recognizer = OnlineRecognizer(
|
||||
assetManager = application.assets,
|
||||
config = config,
|
||||
)
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineRecognizer.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineStream.kt
|
||||
@@ -1,322 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/**
 * A single endpointing rule made of one flag and two thresholds.
 *
 * NOTE(review): the units of the two Float thresholds are not visible in this
 * file (presumably seconds) — confirm against the native implementation.
 */
data class EndpointRule(
    var mustContainNonSilence: Boolean,
    var minTrailingSilence: Float,
    var minUtteranceLength: Float,
)
|
||||
|
||||
/**
 * Groups the three [EndpointRule]s used for endpointing.
 *
 * Each rule has a default, so `EndpointConfig()` yields the stock settings
 * listed below.
 */
data class EndpointConfig(
    var rule1: EndpointRule = EndpointRule(false, 2.4f, 0.0f),
    var rule2: EndpointRule = EndpointRule(true, 1.4f, 0.0f),
    var rule3: EndpointRule = EndpointRule(false, 0.0f, 20.0f),
)
|
||||
|
||||
/**
 * File locations of the three parts of a streaming transducer model.
 *
 * Every field defaults to the empty string.
 */
data class OnlineTransducerModelConfig(
    var encoder: String = "",
    var decoder: String = "",
    var joiner: String = "",
)
|
||||
|
||||
/**
 * File locations of the encoder and decoder of a streaming paraformer model.
 *
 * Both fields default to the empty string.
 */
data class OnlineParaformerModelConfig(
    var encoder: String = "",
    var decoder: String = "",
)
|
||||
|
||||
/** Holds the single model file location of a streaming zipformer2 CTC model (empty by default). */
data class OnlineZipformer2CtcModelConfig(
    var model: String = "",
)
|
||||
|
||||
/**
 * Model-level settings for a streaming recognizer.
 *
 * One sub-config exists per supported model family; each defaults to an empty
 * instance. [tokens] is the only field without a default.
 */
data class OnlineModelConfig(
    var transducer: OnlineTransducerModelConfig = OnlineTransducerModelConfig(),
    var paraformer: OnlineParaformerModelConfig = OnlineParaformerModelConfig(),
    var zipformer2Ctc: OnlineZipformer2CtcModelConfig = OnlineZipformer2CtcModelConfig(),
    var tokens: String,
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
    var modelType: String = "",
)
|
||||
|
||||
/**
 * Language-model settings: a model file location plus a scale factor.
 *
 * Defaults are an empty [model] and a [scale] of 0.5.
 */
data class OnlineLMConfig(
    var model: String = "",
    var scale: Float = 0.5f,
)
|
||||
|
||||
/**
 * Feature-extraction settings.
 *
 * Defaults: [sampleRate] = 16000 and [featureDim] = 80.
 */
data class FeatureConfig(
    var sampleRate: Int = 16000,
    var featureDim: Int = 80,
)
|
||||
|
||||
/**
 * Top-level configuration handed to the native online recognizer constructor.
 *
 * [modelConfig] is the only field that must be supplied. [lmConfig] now defaults
 * to an empty [OnlineLMConfig], so callers that do not use a language model can
 * omit it; existing call sites that pass it are unaffected.
 */
data class OnlineRecognizerConfig(
    var featConfig: FeatureConfig = FeatureConfig(),
    var modelConfig: OnlineModelConfig,
    var lmConfig: OnlineLMConfig = OnlineLMConfig(),
    var endpointConfig: EndpointConfig = EndpointConfig(),
    var enableEndpoint: Boolean = true,
    var decodingMethod: String = "greedy_search",
    var maxActivePaths: Int = 4,
    var hotwordsFile: String = "",
    var hotwordsScore: Float = 1.5f,
)
|
||||
|
||||
/**
 * Kotlin front end for a native online recognizer reached through JNI.
 *
 * A native object is created from [config] at construction time and is referred
 * to by an opaque handle that is forwarded to every native call.
 *
 * NOTE(review): the `external` functions are bound by name from native code —
 * do not rename the class or these members without updating the JNI side.
 *
 * @param assetManager when non-null the native object is created through the
 *   asset-manager entry point; when null the file-based entry point is used.
 * @property config the configuration passed to the native constructor.
 */
class SherpaOnnx(
    assetManager: AssetManager? = null,
    var config: OnlineRecognizerConfig,
) {
    // Opaque handle returned by whichever native constructor was selected.
    private val ptr: Long =
        if (assetManager == null) newFromFile(config) else new(assetManager, config)

    /** Hands the native handle back to the native side when this object is finalized. */
    protected fun finalize() = delete(ptr)

    /** Forwards [samples] and their [sampleRate] to the native object. */
    fun acceptWaveform(samples: FloatArray, sampleRate: Int) {
        acceptWaveform(ptr, samples, sampleRate)
    }

    /** Forwards to the native `inputFinished` call. */
    fun inputFinished() {
        inputFinished(ptr)
    }

    /** Forwards to the native `reset` call with the given [recreate] flag and [hotwords]. */
    fun reset(recreate: Boolean = false, hotwords: String = "") {
        reset(ptr, recreate, hotwords)
    }

    /** Forwards to the native `decode` call. */
    fun decode() {
        decode(ptr)
    }

    /** Returns the result of the native `isEndpoint` call. */
    fun isEndpoint(): Boolean {
        return isEndpoint(ptr)
    }

    /** Returns the result of the native `isReady` call. */
    fun isReady(): Boolean {
        return isReady(ptr)
    }

    /** Text fetched from the native object on every read. */
    val text: String
        get() = getText(ptr)

    /** Tokens fetched from the native object on every read. */
    val tokens: Array<String>
        get() = getTokens(ptr)

    // ---- native entry points ----

    private external fun new(
        assetManager: AssetManager,
        config: OnlineRecognizerConfig,
    ): Long

    private external fun newFromFile(
        config: OnlineRecognizerConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)
    private external fun inputFinished(ptr: Long)
    private external fun getText(ptr: Long): String
    private external fun reset(ptr: Long, recreate: Boolean, hotwords: String)
    private external fun decode(ptr: Long)
    private external fun isEndpoint(ptr: Long): Boolean
    private external fun isReady(ptr: Long): Boolean
    private external fun getTokens(ptr: Long): Array<String>

    companion object {
        init {
            // The native library must be loaded before any external function is called.
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
/** Builds a [FeatureConfig] from the given [sampleRate] and [featureDim]. */
fun getFeatureConfig(sampleRate: Int, featureDim: Int): FeatureConfig =
    FeatureConfig(sampleRate = sampleRate, featureDim = featureDim)
|
||||
|
||||
/*
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
for a list of pre-trained models.
|
||||
|
||||
We only add a few here. Please change the following code
|
||||
to add your own. (It should be straightforward to add a new model
|
||||
by following the code)
|
||||
|
||||
@param type
|
||||
0 - sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 (Bilingual, Chinese + English)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english
|
||||
|
||||
1 - csukuangfj/sherpa-onnx-lstm-zh-2023-02-20 (Chinese)
|
||||
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/lstm-transducer-models.html#csukuangfj-sherpa-onnx-lstm-zh-2023-02-20-chinese
|
||||
|
||||
2 - csukuangfj/sherpa-onnx-lstm-en-2023-02-17 (English)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/lstm-transducer-models.html#csukuangfj-sherpa-onnx-lstm-en-2023-02-17-english
|
||||
|
||||
3,4 - pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615
|
||||
https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615
|
||||
3 - int8 encoder
|
||||
4 - float32 encoder
|
||||
|
||||
5 - csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
|
||||
https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
|
||||
|
||||
6 - sherpa-onnx-streaming-zipformer-en-2023-06-26
|
||||
https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
|
||||
|
||||
7 - shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14 (French)
|
||||
https://huggingface.co/shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14
|
||||
|
||||
8 - csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 (Bilingual, Chinese + English)
|
||||
https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
|
||||
encoder int8, decoder/joiner float32
|
||||
|
||||
*/
|
||||
/**
 * Maps a numeric model [type] (0 through 8) to a ready-made [OnlineModelConfig].
 *
 * @return the matching configuration, or `null` when [type] is not one of the
 *   values handled below.
 */
fun getModelConfig(type: Int): OnlineModelConfig? = when (type) {
    0 -> {
        val modelDir = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/encoder-epoch-99-avg-1.onnx",
                decoder = "$modelDir/decoder-epoch-99-avg-1.onnx",
                joiner = "$modelDir/joiner-epoch-99-avg-1.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "zipformer",
        )
    }

    1 -> {
        val modelDir = "sherpa-onnx-lstm-zh-2023-02-20"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/encoder-epoch-11-avg-1.onnx",
                decoder = "$modelDir/decoder-epoch-11-avg-1.onnx",
                joiner = "$modelDir/joiner-epoch-11-avg-1.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "lstm",
        )
    }

    2 -> {
        val modelDir = "sherpa-onnx-lstm-en-2023-02-17"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/encoder-epoch-99-avg-1.onnx",
                decoder = "$modelDir/decoder-epoch-99-avg-1.onnx",
                joiner = "$modelDir/joiner-epoch-99-avg-1.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "lstm",
        )
    }

    // 3 and 4 share a model directory; 3 uses the int8 encoder file.
    3 -> {
        val modelDir = "icefall-asr-zipformer-streaming-wenetspeech-20230615"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/exp/encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx",
                decoder = "$modelDir/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx",
                joiner = "$modelDir/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
            ),
            tokens = "$modelDir/data/lang_char/tokens.txt",
            modelType = "zipformer2",
        )
    }

    4 -> {
        val modelDir = "icefall-asr-zipformer-streaming-wenetspeech-20230615"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx",
                decoder = "$modelDir/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx",
                joiner = "$modelDir/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx",
            ),
            tokens = "$modelDir/data/lang_char/tokens.txt",
            modelType = "zipformer2",
        )
    }

    // The only entry that fills the paraformer sub-config instead of the transducer one.
    5 -> {
        val modelDir = "sherpa-onnx-streaming-paraformer-bilingual-zh-en"
        OnlineModelConfig(
            paraformer = OnlineParaformerModelConfig(
                encoder = "$modelDir/encoder.int8.onnx",
                decoder = "$modelDir/decoder.int8.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "paraformer",
        )
    }

    6 -> {
        val modelDir = "sherpa-onnx-streaming-zipformer-en-2023-06-26"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx",
                decoder = "$modelDir/decoder-epoch-99-avg-1-chunk-16-left-128.onnx",
                joiner = "$modelDir/joiner-epoch-99-avg-1-chunk-16-left-128.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "zipformer2",
        )
    }

    7 -> {
        val modelDir = "sherpa-onnx-streaming-zipformer-fr-2023-04-14"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/encoder-epoch-29-avg-9-with-averaged-model.int8.onnx",
                decoder = "$modelDir/decoder-epoch-29-avg-9-with-averaged-model.onnx",
                joiner = "$modelDir/joiner-epoch-29-avg-9-with-averaged-model.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "zipformer",
        )
    }

    // Same directory as type 0, but with the int8 encoder file.
    8 -> {
        val modelDir = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"
        OnlineModelConfig(
            transducer = OnlineTransducerModelConfig(
                encoder = "$modelDir/encoder-epoch-99-avg-1.int8.onnx",
                decoder = "$modelDir/decoder-epoch-99-avg-1.onnx",
                joiner = "$modelDir/joiner-epoch-99-avg-1.onnx",
            ),
            tokens = "$modelDir/tokens.txt",
            modelType = "zipformer",
        )
    }

    else -> null
}
|
||||
|
||||
/*
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
for a list of pre-trained models.
|
||||
|
||||
We only add a few here. Please change the following code
|
||||
to add your own LM model. (It should be straightforward to train a new NN LM model
|
||||
by following the code, https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py)
|
||||
|
||||
@param type
|
||||
0 - sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 (Bilingual, Chinese + English)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english
|
||||
*/
|
||||
/**
 * Maps a numeric [type] to an [OnlineLMConfig].
 *
 * Only type 0 selects a concrete model file; any other value yields the
 * default-constructed configuration.
 */
fun getOnlineLMConfig(type: Int): OnlineLMConfig = when (type) {
    0 -> {
        val modelDir = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"
        OnlineLMConfig(
            model = "$modelDir/with-state-epoch-99-avg-1.int8.onnx",
            scale = 0.5f,
        )
    }

    else -> OnlineLMConfig()
}
|
||||
|
||||
/** Builds an [EndpointConfig] with three explicitly spelled-out rules. */
fun getEndpointConfig(): EndpointConfig {
    val first = EndpointRule(
        mustContainNonSilence = false,
        minTrailingSilence = 2.4f,
        minUtteranceLength = 0.0f,
    )
    val second = EndpointRule(
        mustContainNonSilence = true,
        minTrailingSilence = 1.4f,
        minUtteranceLength = 0.0f,
    )
    val third = EndpointRule(
        mustContainNonSilence = false,
        minTrailingSilence = 0.0f,
        minUtteranceLength = 20.0f,
    )
    return EndpointConfig(rule1 = first, rule2 = second, rule3 = third)
}
|
||||
@@ -1,29 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/**
 * Namespace for the native wave-file readers.
 *
 * NOTE(review): the `external` functions are bound by name from native code —
 * keep the class, companion and function names as they are.
 */
class WaveReader {
    companion object {
        init {
            // The native library must be loaded before either reader is called.
            System.loadLibrary("sherpa-onnx-jni")
        }

        /**
         * Reads a mono wave file stored as an asset.
         *
         * @return a two-entry array: first a 1-D float array, then the sample rate.
         */
        external fun readWaveFromAsset(
            assetManager: AssetManager,
            filename: String,
        ): Array<Any>

        /**
         * Reads a mono wave file from disk.
         *
         * @return a two-entry array: first a 1-D float array, then the sample rate.
         */
        external fun readWaveFromFile(
            filename: String,
        ): Array<Any>
    }
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/WaveReader.kt
|
||||
@@ -16,6 +16,7 @@
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:label="2pass ASR: Next-gen Kaldi"
|
||||
android:exported="true">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
@@ -29,4 +30,4 @@
|
||||
</activity>
|
||||
</application>
|
||||
|
||||
</manifest>
|
||||
</manifest>
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/FeatureConfig.kt
|
||||
@@ -17,11 +17,13 @@ import kotlin.concurrent.thread
|
||||
private const val TAG = "sherpa-onnx"
|
||||
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
|
||||
// adb emu avd hostmicon
|
||||
// to enable microphone inside the emulator
|
||||
class MainActivity : AppCompatActivity() {
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
private lateinit var onlineRecognizer: SherpaOnnx
|
||||
private lateinit var offlineRecognizer: SherpaOnnxOffline
|
||||
private lateinit var onlineRecognizer: OnlineRecognizer
|
||||
private lateinit var offlineRecognizer: OfflineRecognizer
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private lateinit var recordButton: Button
|
||||
private lateinit var textView: TextView
|
||||
@@ -93,7 +95,6 @@ class MainActivity : AppCompatActivity() {
|
||||
audioRecord!!.startRecording()
|
||||
recordButton.setText(R.string.stop)
|
||||
isRecording = true
|
||||
onlineRecognizer.reset(true)
|
||||
samplesBuffer.clear()
|
||||
textView.text = ""
|
||||
lastText = ""
|
||||
@@ -115,6 +116,7 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
private fun processSamples() {
|
||||
Log.i(TAG, "processing samples")
|
||||
val stream = onlineRecognizer.createStream()
|
||||
|
||||
val interval = 0.1 // i.e., 100 ms
|
||||
val bufferSize = (interval * sampleRateInHz).toInt() // in samples
|
||||
@@ -126,29 +128,29 @@ class MainActivity : AppCompatActivity() {
|
||||
val samples = FloatArray(ret) { buffer[it] / 32768.0f }
|
||||
samplesBuffer.add(samples)
|
||||
|
||||
onlineRecognizer.acceptWaveform(samples, sampleRate = sampleRateInHz)
|
||||
while (onlineRecognizer.isReady()) {
|
||||
onlineRecognizer.decode()
|
||||
stream.acceptWaveform(samples, sampleRate = sampleRateInHz)
|
||||
while (onlineRecognizer.isReady(stream)) {
|
||||
onlineRecognizer.decode(stream)
|
||||
}
|
||||
val isEndpoint = onlineRecognizer.isEndpoint()
|
||||
val isEndpoint = onlineRecognizer.isEndpoint(stream)
|
||||
var textToDisplay = lastText
|
||||
|
||||
var text = onlineRecognizer.text
|
||||
var text = onlineRecognizer.getResult(stream).text
|
||||
if (text.isNotBlank()) {
|
||||
if (lastText.isBlank()) {
|
||||
textToDisplay = if (lastText.isBlank()) {
|
||||
// textView.text = "${idx}: ${text}"
|
||||
textToDisplay = "${idx}: ${text}"
|
||||
"${idx}: $text"
|
||||
} else {
|
||||
textToDisplay = "${lastText}\n${idx}: ${text}"
|
||||
"${lastText}\n${idx}: $text"
|
||||
}
|
||||
}
|
||||
|
||||
if (isEndpoint) {
|
||||
onlineRecognizer.reset()
|
||||
onlineRecognizer.reset(stream)
|
||||
|
||||
if (text.isNotBlank()) {
|
||||
text = runSecondPass()
|
||||
lastText = "${lastText}\n${idx}: ${text}"
|
||||
lastText = "${lastText}\n${idx}: $text"
|
||||
idx += 1
|
||||
} else {
|
||||
samplesBuffer.clear()
|
||||
@@ -160,6 +162,7 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
}
|
||||
stream.release()
|
||||
}
|
||||
|
||||
private fun initMicrophone(): Boolean {
|
||||
@@ -190,8 +193,8 @@ class MainActivity : AppCompatActivity() {
|
||||
// Please change getModelConfig() to add new models
|
||||
// See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// for a list of available models
|
||||
val firstType = 1
|
||||
println("Select model type ${firstType} for the first pass")
|
||||
val firstType = 9
|
||||
Log.i(TAG, "Select model type $firstType for the first pass")
|
||||
val config = OnlineRecognizerConfig(
|
||||
featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
|
||||
modelConfig = getModelConfig(type = firstType)!!,
|
||||
@@ -199,7 +202,7 @@ class MainActivity : AppCompatActivity() {
|
||||
enableEndpoint = true,
|
||||
)
|
||||
|
||||
onlineRecognizer = SherpaOnnx(
|
||||
onlineRecognizer = OnlineRecognizer(
|
||||
assetManager = application.assets,
|
||||
config = config,
|
||||
)
|
||||
@@ -209,15 +212,15 @@ class MainActivity : AppCompatActivity() {
|
||||
// Please change getOfflineModelConfig() to add new models
|
||||
// See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// for a list of available models
|
||||
val secondType = 1
|
||||
println("Select model type ${secondType} for the second pass")
|
||||
val secondType = 0
|
||||
Log.i(TAG, "Select model type $secondType for the second pass")
|
||||
|
||||
val config = OfflineRecognizerConfig(
|
||||
featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
|
||||
modelConfig = getOfflineModelConfig(type = secondType)!!,
|
||||
)
|
||||
|
||||
offlineRecognizer = SherpaOnnxOffline(
|
||||
offlineRecognizer = OfflineRecognizer(
|
||||
assetManager = application.assets,
|
||||
config = config,
|
||||
)
|
||||
@@ -244,8 +247,15 @@ class MainActivity : AppCompatActivity() {
|
||||
val n = maxOf(0, samples.size - 8000)
|
||||
|
||||
samplesBuffer.clear()
|
||||
samplesBuffer.add(samples.sliceArray(n..samples.size-1))
|
||||
samplesBuffer.add(samples.sliceArray(n until samples.size))
|
||||
|
||||
return offlineRecognizer.decode(samples.sliceArray(0..n), sampleRateInHz)
|
||||
val stream = offlineRecognizer.createStream()
|
||||
stream.acceptWaveform(samples.sliceArray(0..n), sampleRateInHz)
|
||||
offlineRecognizer.decode(stream)
|
||||
val result = offlineRecognizer.getResult(stream)
|
||||
|
||||
stream.release()
|
||||
|
||||
return result.text
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineRecognizer.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineStream.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineRecognizer.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineStream.kt
|
||||
@@ -1,404 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/**
 * A single endpointing rule made of one flag and two thresholds.
 *
 * NOTE(review): the units of the two Float thresholds are not visible in this
 * file (presumably seconds) — confirm against the native implementation.
 */
data class EndpointRule(
    var mustContainNonSilence: Boolean,
    var minTrailingSilence: Float,
    var minUtteranceLength: Float,
)
|
||||
|
||||
/**
 * Groups the three [EndpointRule]s used for endpointing.
 *
 * Each rule has a default, so `EndpointConfig()` yields the stock settings
 * listed below.
 */
data class EndpointConfig(
    var rule1: EndpointRule = EndpointRule(false, 2.0f, 0.0f),
    var rule2: EndpointRule = EndpointRule(true, 1.2f, 0.0f),
    var rule3: EndpointRule = EndpointRule(false, 0.0f, 20.0f),
)
|
||||
|
||||
/**
 * File locations of the three parts of a streaming transducer model.
 *
 * Every field defaults to the empty string.
 */
data class OnlineTransducerModelConfig(
    var encoder: String = "",
    var decoder: String = "",
    var joiner: String = "",
)
|
||||
|
||||
/**
 * File locations of the encoder and decoder of a streaming paraformer model.
 *
 * Both fields default to the empty string.
 */
data class OnlineParaformerModelConfig(
    var encoder: String = "",
    var decoder: String = "",
)
|
||||
|
||||
/** Holds the single model file location of a streaming zipformer2 CTC model (empty by default). */
data class OnlineZipformer2CtcModelConfig(
    var model: String = "",
)
|
||||
|
||||
/**
 * Model-level settings for a streaming recognizer.
 *
 * One sub-config exists per supported model family; each defaults to an empty
 * instance. [tokens] is the only field without a default.
 */
data class OnlineModelConfig(
    var transducer: OnlineTransducerModelConfig = OnlineTransducerModelConfig(),
    var paraformer: OnlineParaformerModelConfig = OnlineParaformerModelConfig(),
    var zipformer2Ctc: OnlineZipformer2CtcModelConfig = OnlineZipformer2CtcModelConfig(),
    var tokens: String,
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
    var modelType: String = "",
)
|
||||
|
||||
/**
 * Language-model settings: a model file location plus a scale factor.
 *
 * Defaults are an empty [model] and a [scale] of 0.5.
 */
data class OnlineLMConfig(
    var model: String = "",
    var scale: Float = 0.5f,
)
|
||||
|
||||
/**
 * Feature-extraction settings.
 *
 * Defaults: [sampleRate] = 16000 and [featureDim] = 80.
 */
data class FeatureConfig(
    var sampleRate: Int = 16000,
    var featureDim: Int = 80,
)
|
||||
|
||||
/**
 * Top-level configuration handed to the native online recognizer constructor.
 *
 * [modelConfig] is the only field that must be supplied; every other field has
 * a default.
 */
data class OnlineRecognizerConfig(
    var featConfig: FeatureConfig = FeatureConfig(),
    var modelConfig: OnlineModelConfig,
    var lmConfig: OnlineLMConfig = OnlineLMConfig(),
    var endpointConfig: EndpointConfig = EndpointConfig(),
    var enableEndpoint: Boolean = true,
    var decodingMethod: String = "greedy_search",
    var maxActivePaths: Int = 4,
    var hotwordsFile: String = "",
    var hotwordsScore: Float = 1.5f,
)
|
||||
|
||||
/**
 * File locations of the three parts of a non-streaming transducer model.
 *
 * Every field defaults to the empty string.
 */
data class OfflineTransducerModelConfig(
    var encoder: String = "",
    var decoder: String = "",
    var joiner: String = "",
)
|
||||
|
||||
/** Holds the single model file location of a non-streaming paraformer model (empty by default). */
data class OfflineParaformerModelConfig(
    var model: String = "",
)
|
||||
|
||||
/**
 * Settings for a Whisper model: encoder/decoder file locations plus decoding options.
 *
 * @property language used with a multilingual model; defaults to "en".
 * @property task either "transcribe" or "translate"; defaults to "transcribe".
 * @property tailPaddings padding added at the end of the samples; defaults to 1000.
 */
data class OfflineWhisperModelConfig(
    var encoder: String = "",
    var decoder: String = "",
    var language: String = "en",
    var task: String = "transcribe",
    var tailPaddings: Int = 1000,
)
|
||||
|
||||
/**
 * Model-level settings for a non-streaming recognizer.
 *
 * One sub-config exists per supported model family; each defaults to an empty
 * instance. [tokens] is the only field without a default and is declared last.
 */
data class OfflineModelConfig(
    var transducer: OfflineTransducerModelConfig = OfflineTransducerModelConfig(),
    var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(),
    var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(),
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
    var modelType: String = "",
    var tokens: String,
)
|
||||
|
||||
/**
 * Top-level configuration handed to the native offline recognizer constructor.
 *
 * [modelConfig] is the only field that must be supplied; every other field has
 * a default.
 */
data class OfflineRecognizerConfig(
    var featConfig: FeatureConfig = FeatureConfig(),
    var modelConfig: OfflineModelConfig,
    // var lmConfig: OfflineLMConfig(), // TODO(fangjun): enable it
    var decodingMethod: String = "greedy_search",
    var maxActivePaths: Int = 4,
    var hotwordsFile: String = "",
    var hotwordsScore: Float = 1.5f,
)
|
||||
|
||||
/**
 * Kotlin front end for a native online recognizer reached through JNI.
 *
 * A native object is created from [config] at construction time and is referred
 * to by an opaque handle that is forwarded to every native call.
 *
 * NOTE(review): the `external` functions are bound by name from native code —
 * do not rename the class or these members without updating the JNI side.
 *
 * @param assetManager when non-null the native object is created through the
 *   asset-manager entry point; when null the file-based entry point is used.
 * @property config the configuration passed to the native constructor.
 */
class SherpaOnnx(
    assetManager: AssetManager? = null,
    var config: OnlineRecognizerConfig,
) {
    // Opaque handle returned by whichever native constructor was selected.
    private val ptr: Long =
        if (assetManager == null) newFromFile(config) else new(assetManager, config)

    /** Hands the native handle back to the native side when this object is finalized. */
    protected fun finalize() = delete(ptr)

    /** Forwards [samples] and their [sampleRate] to the native object. */
    fun acceptWaveform(samples: FloatArray, sampleRate: Int) {
        acceptWaveform(ptr, samples, sampleRate)
    }

    /** Forwards to the native `inputFinished` call. */
    fun inputFinished() {
        inputFinished(ptr)
    }

    /** Forwards to the native `reset` call with the given [recreate] flag and [hotwords]. */
    fun reset(recreate: Boolean = false, hotwords: String = "") {
        reset(ptr, recreate, hotwords)
    }

    /** Forwards to the native `decode` call. */
    fun decode() {
        decode(ptr)
    }

    /** Returns the result of the native `isEndpoint` call. */
    fun isEndpoint(): Boolean {
        return isEndpoint(ptr)
    }

    /** Returns the result of the native `isReady` call. */
    fun isReady(): Boolean {
        return isReady(ptr)
    }

    /** Text fetched from the native object on every read. */
    val text: String
        get() = getText(ptr)

    /** Tokens fetched from the native object on every read. */
    val tokens: Array<String>
        get() = getTokens(ptr)

    // ---- native entry points ----

    private external fun new(
        assetManager: AssetManager,
        config: OnlineRecognizerConfig,
    ): Long

    private external fun newFromFile(
        config: OnlineRecognizerConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)
    private external fun inputFinished(ptr: Long)
    private external fun getText(ptr: Long): String
    private external fun reset(ptr: Long, recreate: Boolean, hotwords: String)
    private external fun decode(ptr: Long)
    private external fun isEndpoint(ptr: Long): Boolean
    private external fun isReady(ptr: Long): Boolean
    private external fun getTokens(ptr: Long): Array<String>

    companion object {
        init {
            // The native library must be loaded before any external function is called.
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
/**
 * Kotlin front end for a native offline recognizer reached through JNI.
 *
 * A native object is created from [config] at construction time and is referred
 * to by an opaque handle that is forwarded to every native call.
 *
 * NOTE(review): the `external` functions are bound by name from native code —
 * do not rename the class or these members without updating the JNI side.
 *
 * @param assetManager when non-null the native object is created through the
 *   asset-manager entry point; when null the file-based entry point is used.
 * @property config the configuration passed to the native constructor.
 */
class SherpaOnnxOffline(
    assetManager: AssetManager? = null,
    var config: OfflineRecognizerConfig,
) {
    // Opaque handle returned by whichever native constructor was selected.
    private val ptr: Long =
        if (assetManager == null) newFromFile(config) else new(assetManager, config)

    /** Hands the native handle back to the native side when this object is finalized. */
    protected fun finalize() = delete(ptr)

    /** Passes [samples] and [sampleRate] to the native decoder and returns its String result. */
    fun decode(samples: FloatArray, sampleRate: Int): String {
        return decode(ptr, samples, sampleRate)
    }

    // ---- native entry points ----

    private external fun new(
        assetManager: AssetManager,
        config: OfflineRecognizerConfig,
    ): Long

    private external fun newFromFile(
        config: OfflineRecognizerConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun decode(ptr: Long, samples: FloatArray, sampleRate: Int): String

    companion object {
        init {
            // The native library must be loaded before any external function is called.
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
/** Builds a [FeatureConfig] from the given [sampleRate] and [featureDim]. */
fun getFeatureConfig(sampleRate: Int, featureDim: Int): FeatureConfig =
    FeatureConfig(sampleRate = sampleRate, featureDim = featureDim)
|
||||
|
||||
/*
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
for a list of pre-trained models.
|
||||
|
||||
We only add a few here. Please change the following code
|
||||
to add your own. (It should be straightforward to add a new model
|
||||
by following the code)
|
||||
|
||||
@param type
|
||||
0 - csukuangfj/sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23 (Chinese)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-zh-14m-2023-02-23
|
||||
encoder/joiner int8, decoder float32
|
||||
|
||||
1 - csukuangfj/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17 (English)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-20m-2023-02-17-english
|
||||
encoder/joiner int8, decoder fp32
|
||||
|
||||
*/
|
||||
/**
 * Returns the streaming model configuration selected by [type], or null
 * when [type] is neither 0 nor 1.
 */
fun getModelConfig(type: Int): OnlineModelConfig? {
    // Both supported models use the same file names; only the directory differs.
    val modelDir = when (type) {
        0 -> "sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23"
        1 -> "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17"
        else -> return null
    }

    val transducer = OnlineTransducerModelConfig(
        encoder = "$modelDir/encoder-epoch-99-avg-1.int8.onnx",
        decoder = "$modelDir/decoder-epoch-99-avg-1.onnx",
        joiner = "$modelDir/joiner-epoch-99-avg-1.int8.onnx",
    )
    return OnlineModelConfig(
        transducer = transducer,
        tokens = "$modelDir/tokens.txt",
        modelType = "zipformer",
    )
}
|
||||
|
||||
/*
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
for a list of pre-trained models.
|
||||
|
||||
We only add a few here. Please change the following code
|
||||
to add your own LM model. (It should be straightforward to train a new NN LM model
|
||||
by following the code, https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py)
|
||||
|
||||
@param type
|
||||
0 - sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 (Bilingual, Chinese + English)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english
|
||||
*/
|
||||
/**
 * Returns the language-model configuration selected by [type].
 *
 * Only type 0 has a dedicated configuration; every other value yields a
 * default-constructed [OnlineLMConfig].
 */
fun getOnlineLMConfig(type: Int): OnlineLMConfig {
    if (type != 0) {
        return OnlineLMConfig()
    }

    val modelDir = "sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20"
    return OnlineLMConfig(
        model = "$modelDir/with-state-epoch-99-avg-1.int8.onnx",
        scale = 0.5f,
    )
}
|
||||
|
||||
/**
 * Returns the endpointing configuration used by the demo.
 *
 * For English models, use a small value for rule2.minTrailingSilence, e.g., 0.8.
 */
fun getEndpointConfig(): EndpointConfig {
    val first = EndpointRule(false, 2.4f, 0.0f)
    val second = EndpointRule(true, 0.8f, 0.0f)
    val third = EndpointRule(false, 0.0f, 20.0f)
    return EndpointConfig(rule1 = first, rule2 = second, rule3 = third)
}
|
||||
|
||||
/*
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
for a list of pre-trained models.
|
||||
|
||||
We only add a few here. Please change the following code
|
||||
to add your own. (It should be straightforward to add a new model
|
||||
by following the code)
|
||||
|
||||
@param type
|
||||
|
||||
0 - csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28 (Chinese)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese
|
||||
int8
|
||||
|
||||
1 - icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04 (English)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#icefall-asr-multidataset-pruned-transducer-stateless7-2023-05-04-english
|
||||
encoder int8, decoder/joiner float32
|
||||
|
||||
2 - sherpa-onnx-whisper-tiny.en
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html#tiny-en
|
||||
encoder int8, decoder int8
|
||||
|
||||
3 - sherpa-onnx-whisper-base.en
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html#tiny-en
|
||||
encoder int8, decoder int8
|
||||
|
||||
4 - pkufool/icefall-asr-zipformer-wenetspeech-20230615 (Chinese)
|
||||
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#pkufool-icefall-asr-zipformer-wenetspeech-20230615-chinese
|
||||
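encoder/joiner int8, decoder fp32
|
||||
|
||||
5 - sherpa-onnx-zipformer-multi-zh-hans-2023-9-2 (Chinese)
|
||||
encoder/joiner int8, decoder fp32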
|
||||
|
||||
*/
|
||||
/**
 * Returns the non-streaming model configuration selected by [type], or null
 * when [type] is outside 0..5.
 */
fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
    // Transducer models share a file layout: the encoder is always int8, the
    // decoder is always float, and the joiner extension varies per model.
    fun transducerConfig(
        modelDir: String,
        stem: String,
        joinerExt: String,
        modelType: String,
    ) = OfflineModelConfig(
        transducer = OfflineTransducerModelConfig(
            encoder = "$modelDir/encoder-$stem.int8.onnx",
            decoder = "$modelDir/decoder-$stem.onnx",
            joiner = "$modelDir/joiner-$stem$joinerExt",
        ),
        tokens = "$modelDir/tokens.txt",
        modelType = modelType,
    )

    // Whisper models prefix every file with the model name (e.g. "tiny.en").
    fun whisperConfig(modelDir: String, prefix: String) = OfflineModelConfig(
        whisper = OfflineWhisperModelConfig(
            encoder = "$modelDir/$prefix-encoder.int8.onnx",
            decoder = "$modelDir/$prefix-decoder.int8.onnx",
        ),
        tokens = "$modelDir/$prefix-tokens.txt",
        modelType = "whisper",
    )

    return when (type) {
        0 -> {
            val modelDir = "sherpa-onnx-paraformer-zh-2023-03-28"
            OfflineModelConfig(
                paraformer = OfflineParaformerModelConfig(
                    model = "$modelDir/model.int8.onnx",
                ),
                tokens = "$modelDir/tokens.txt",
                modelType = "paraformer",
            )
        }

        1 -> transducerConfig(
            modelDir = "icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04",
            stem = "epoch-30-avg-4",
            joinerExt = ".onnx",
            modelType = "zipformer",
        )

        2 -> whisperConfig(modelDir = "sherpa-onnx-whisper-tiny.en", prefix = "tiny.en")

        3 -> whisperConfig(modelDir = "sherpa-onnx-whisper-base.en", prefix = "base.en")

        4 -> transducerConfig(
            modelDir = "icefall-asr-zipformer-wenetspeech-20230615",
            stem = "epoch-12-avg-4",
            joinerExt = ".int8.onnx",
            modelType = "zipformer",
        )

        5 -> transducerConfig(
            modelDir = "sherpa-onnx-zipformer-multi-zh-hans-2023-9-2",
            stem = "epoch-20-avg-1",
            joinerExt = ".int8.onnx",
            modelType = "zipformer2",
        )

        else -> null
    }
}
|
||||
@@ -1,28 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/** JNI entry points for reading mono wave files. */
class WaveReader {
    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }

        /**
         * Reads a mono wave file asset.
         *
         * The returned array has two entries: the first contains a 1-D
         * float array, the second is the sample rate.
         */
        external fun readWaveFromAsset(
            assetManager: AssetManager,
            filename: String,
        ): Array<Any>

        /**
         * Reads a mono wave file from disk.
         *
         * The returned array has two entries: the first contains a 1-D
         * float array, the second is the sample rate.
         */
        external fun readWaveFromFile(
            filename: String,
        ): Array<Any>
    }
}
|
||||
@@ -1,188 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
const val TAG = "sherpa-onnx"
|
||||
|
||||
/** Holds the model string for a zipformer audio-tagging model; empty by default. */
data class OfflineZipformerAudioTaggingModelConfig(var model: String = "")
|
||||
|
||||
/**
 * Model-level settings for audio tagging.
 *
 * Carries a zipformer model config and a CED model string (both empty by
 * default) together with the thread count, debug flag and provider name.
 */
data class AudioTaggingModelConfig(
    var zipformer: OfflineZipformerAudioTaggingModelConfig =
        OfflineZipformerAudioTaggingModelConfig(),
    var ced: String = "",
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
)
|
||||
|
||||
/**
 * Top-level audio-tagging configuration: the model settings, the labels
 * string, and `topK` (5 by default).
 */
data class AudioTaggingConfig(
    var model: AudioTaggingModelConfig,
    var labels: String,
    var topK: Int = 5,
)
|
||||
|
||||
/** One tagging result: an event name, its index, and its probability. */
data class AudioEvent(val name: String, val index: Int, val prob: Float)
|
||||
|
||||
/**
 * Kotlin-side handle for a native audio tagger reached through JNI.
 *
 * When [assetManager] is non-null the native object is created with
 * `newFromAsset`; otherwise `newFromFile` is used. Call [release] to free
 * the native object explicitly; the handle is zeroed afterwards so the
 * native delete runs at most once.
 */
class AudioTagging(
    assetManager: AssetManager? = null,
    config: AudioTaggingConfig,
) {
    // Native handle; 0 once the native object has been deleted.
    private var ptr: Long = if (assetManager == null) {
        newFromFile(config)
    } else {
        newFromAsset(assetManager, config)
    }

    protected fun finalize() {
        if (ptr == 0L) {
            return
        }
        delete(ptr)
        ptr = 0
    }

    fun release() {
        finalize()
    }

    /** Creates a native stream and wraps its handle in an [OfflineStream]. */
    fun createStream(): OfflineStream = OfflineStream(createStream(ptr))

    /**
     * Runs the native compute call on [stream] with [topK] and converts each
     * returned entry into an [AudioEvent].
     *
     * Each native entry is read as a three-element array holding, in order,
     * the name (String), the index (Int) and the probability (Float).
     */
    @Suppress("UNCHECKED_CAST")
    fun compute(stream: OfflineStream, topK: Int = -1): ArrayList<AudioEvent> {
        val rawEvents: Array<Any> = compute(ptr, stream.ptr, topK)
        return rawEvents.mapTo(ArrayList<AudioEvent>()) { raw ->
            val fields = raw as Array<Any>
            AudioEvent(
                name = fields[0] as String,
                index = fields[1] as Int,
                prob = fields[2] as Float,
            )
        }
    }

    private external fun newFromAsset(
        assetManager: AssetManager,
        config: AudioTaggingConfig,
    ): Long

    private external fun newFromFile(
        config: AudioTaggingConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun createStream(ptr: Long): Long

    private external fun compute(ptr: Long, streamPtr: Long, topK: Int): Array<Any>

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
// please refer to
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
|
||||
// to download more models
|
||||
//
|
||||
// See also
|
||||
// https://k2-fsa.github.io/sherpa/onnx/audio-tagging/
|
||||
/**
 * Returns the audio-tagging configuration selected by [type], or null when
 * [type] is outside 0..5.
 *
 * Types 0 and 1 select zipformer models; types 2 to 5 select CED models.
 * Every configuration enables debug, uses [numThreads] threads and sets
 * topK to 3.
 */
fun getAudioTaggingConfig(type: Int, numThreads: Int = 1): AudioTaggingConfig? {
    // Wraps the model settings with the labels file and topK shared by all types.
    fun configFor(modelDir: String, model: AudioTaggingModelConfig) = AudioTaggingConfig(
        model = model,
        labels = "$modelDir/class_labels_indices.csv",
        topK = 3,
    )

    fun zipformerConfig(modelDir: String) = configFor(
        modelDir,
        AudioTaggingModelConfig(
            zipformer = OfflineZipformerAudioTaggingModelConfig(model = "$modelDir/model.int8.onnx"),
            numThreads = numThreads,
            debug = true,
        ),
    )

    fun cedConfig(modelDir: String) = configFor(
        modelDir,
        AudioTaggingModelConfig(
            ced = "$modelDir/model.int8.onnx",
            numThreads = numThreads,
            debug = true,
        ),
    )

    return when (type) {
        0 -> zipformerConfig("sherpa-onnx-zipformer-small-audio-tagging-2024-04-15")
        1 -> zipformerConfig("sherpa-onnx-zipformer-audio-tagging-2024-04-09")
        2 -> cedConfig("sherpa-onnx-ced-tiny-audio-tagging-2024-04-19")
        3 -> cedConfig("sherpa-onnx-ced-mini-audio-tagging-2024-04-19")
        4 -> cedConfig("sherpa-onnx-ced-small-audio-tagging-2024-04-19")
        5 -> cedConfig("sherpa-onnx-ced-base-audio-tagging-2024-04-19")
        else -> null
    }
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../../../sherpa-onnx/kotlin-api/AudioTagging.kt
|
||||
@@ -46,7 +46,6 @@ import androidx.compose.ui.unit.dp
|
||||
import androidx.compose.ui.unit.sp
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.AudioEvent
|
||||
import com.k2fsa.sherpa.onnx.Tagger
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
|
||||
|
||||
@@ -13,13 +13,14 @@ import androidx.compose.material3.Surface
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.Tagger
|
||||
import com.k2fsa.sherpa.onnx.audio.tagging.ui.theme.SherpaOnnxAudioTaggingTheme
|
||||
|
||||
const val TAG = "sherpa-onnx"
|
||||
|
||||
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
|
||||
// adb emu avd hostmicon
|
||||
// to enable mic inside the emulator
|
||||
class MainActivity : ComponentActivity() {
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/**
 * Kotlin-side handle for a native offline stream.
 *
 * [ptr] is the native handle; it is set to 0 once the native object has
 * been deleted, so deletion happens at most once.
 */
class OfflineStream(var ptr: Long) {
    /** Forwards [samples] and [sampleRate] to the native stream. */
    fun acceptWaveform(samples: FloatArray, sampleRate: Int) {
        acceptWaveform(ptr, samples, sampleRate)
    }

    protected fun finalize() {
        if (ptr == 0L) {
            return
        }
        delete(ptr)
        ptr = 0
    }

    /** Frees the native stream now instead of waiting for finalization. */
    fun release() {
        finalize()
    }

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)

    private external fun delete(ptr: Long)

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineStream.kt
|
||||
@@ -1,7 +1,9 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
package com.k2fsa.sherpa.onnx.audio.tagging
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.util.Log
|
||||
import com.k2fsa.sherpa.onnx.AudioTagging
|
||||
import com.k2fsa.sherpa.onnx.getAudioTaggingConfig
|
||||
|
||||
|
||||
object Tagger {
|
||||
@@ -17,7 +19,7 @@ object Tagger {
|
||||
return
|
||||
}
|
||||
|
||||
Log.i(TAG, "Initializing audio tagger")
|
||||
Log.i("sherpa-onnx", "Initializing audio tagger")
|
||||
val config = getAudioTaggingConfig(type = 0, numThreads = numThreads)!!
|
||||
_tagger = AudioTagging(assetManager, config)
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ import androidx.wear.compose.material.Button
|
||||
import androidx.wear.compose.material.MaterialTheme
|
||||
import androidx.wear.compose.material.Text
|
||||
import com.k2fsa.sherpa.onnx.AudioEvent
|
||||
import com.k2fsa.sherpa.onnx.Tagger
|
||||
import com.k2fsa.sherpa.onnx.audio.tagging.Tagger
|
||||
import com.k2fsa.sherpa.onnx.audio.tagging.wear.os.presentation.theme.SherpaOnnxAudioTaggingWearOsTheme
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
|
||||
@@ -17,11 +17,14 @@ import androidx.activity.compose.setContent
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.core.app.ActivityCompat
|
||||
import androidx.core.splashscreen.SplashScreen.Companion.installSplashScreen
|
||||
import com.k2fsa.sherpa.onnx.Tagger
|
||||
import com.k2fsa.sherpa.onnx.audio.tagging.Tagger
|
||||
|
||||
const val TAG = "sherpa-onnx"
|
||||
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
|
||||
// adb emu avd hostmicon
|
||||
// to enable mic inside the emulator
|
||||
|
||||
class MainActivity : ComponentActivity() {
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
|
||||
@@ -15,7 +15,8 @@
|
||||
android:theme="@style/Theme.SherpaOnnx"
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:name=".kws.MainActivity"
|
||||
android:label="Keyword-spotter"
|
||||
android:exported="true">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/FeatureConfig.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/KeywordSpotter.kt
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
package com.k2fsa.sherpa.onnx.kws
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
@@ -14,7 +14,13 @@ import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.*
|
||||
import com.k2fsa.sherpa.onnx.KeywordSpotter
|
||||
import com.k2fsa.sherpa.onnx.KeywordSpotterConfig
|
||||
import com.k2fsa.sherpa.onnx.OnlineStream
|
||||
import com.k2fsa.sherpa.onnx.R
|
||||
import com.k2fsa.sherpa.onnx.getFeatureConfig
|
||||
import com.k2fsa.sherpa.onnx.getKeywordsFile
|
||||
import com.k2fsa.sherpa.onnx.getKwsModelConfig
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
private const val TAG = "sherpa-onnx"
|
||||
@@ -23,7 +29,8 @@ private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
class MainActivity : AppCompatActivity() {
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
private lateinit var model: SherpaOnnxKws
|
||||
private lateinit var kws: KeywordSpotter
|
||||
private lateinit var stream: OnlineStream
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private lateinit var recordButton: Button
|
||||
private lateinit var textView: TextView
|
||||
@@ -87,15 +94,18 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
Log.i(TAG, keywords)
|
||||
keywords = keywords.replace("\n", "/")
|
||||
keywords = keywords.trim()
|
||||
// If keywords is an empty string, it just resets the decoding stream
|
||||
// always returns true in this case.
|
||||
// If keywords is not empty, it will create a new decoding stream with
|
||||
// the given keywords appended to the default keywords.
|
||||
// Return false if errors occured when adding keywords, true otherwise.
|
||||
val status = model.reset(keywords)
|
||||
if (!status) {
|
||||
Log.i(TAG, "Failed to reset with keywords.")
|
||||
Toast.makeText(this, "Failed to set keywords.", Toast.LENGTH_LONG).show();
|
||||
// Return false if errors occurred when adding keywords, true otherwise.
|
||||
stream.release()
|
||||
stream = kws.createStream(keywords)
|
||||
if (stream.ptr == 0L) {
|
||||
Log.i(TAG, "Failed to create stream with keywords: $keywords")
|
||||
Toast.makeText(this, "Failed to set keywords to $keywords.", Toast.LENGTH_LONG)
|
||||
.show()
|
||||
return
|
||||
}
|
||||
|
||||
@@ -122,6 +132,7 @@ class MainActivity : AppCompatActivity() {
|
||||
audioRecord!!.release()
|
||||
audioRecord = null
|
||||
recordButton.setText(R.string.start)
|
||||
stream.release()
|
||||
Log.i(TAG, "Stopped recording")
|
||||
}
|
||||
}
|
||||
@@ -137,22 +148,22 @@ class MainActivity : AppCompatActivity() {
|
||||
val ret = audioRecord?.read(buffer, 0, buffer.size)
|
||||
if (ret != null && ret > 0) {
|
||||
val samples = FloatArray(ret) { buffer[it] / 32768.0f }
|
||||
model.acceptWaveform(samples, sampleRate=sampleRateInHz)
|
||||
while (model.isReady()) {
|
||||
model.decode()
|
||||
stream.acceptWaveform(samples, sampleRate = sampleRateInHz)
|
||||
while (kws.isReady(stream)) {
|
||||
kws.decode(stream)
|
||||
}
|
||||
|
||||
val text = model.keyword
|
||||
val text = kws.getResult(stream).keyword
|
||||
|
||||
var textToDisplay = lastText;
|
||||
var textToDisplay = lastText
|
||||
|
||||
if(text.isNotBlank()) {
|
||||
if (text.isNotBlank()) {
|
||||
if (lastText.isBlank()) {
|
||||
textToDisplay = "${idx}: ${text}"
|
||||
textToDisplay = "$idx: $text"
|
||||
} else {
|
||||
textToDisplay = "${idx}: ${text}\n${lastText}"
|
||||
textToDisplay = "$idx: $text\n$lastText"
|
||||
}
|
||||
lastText = "${idx}: ${text}\n${lastText}"
|
||||
lastText = "$idx: $text\n$lastText"
|
||||
idx += 1
|
||||
}
|
||||
|
||||
@@ -188,20 +199,21 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
private fun initModel() {
|
||||
// Please change getModelConfig() to add new models
|
||||
// Please change getKwsModelConfig() to add new models
|
||||
// See https://k2-fsa.github.io/sherpa/onnx/kws/pretrained_models/index.html
|
||||
// for a list of available models
|
||||
val type = 0
|
||||
Log.i(TAG, "Select model type ${type}")
|
||||
Log.i(TAG, "Select model type $type")
|
||||
val config = KeywordSpotterConfig(
|
||||
featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
|
||||
modelConfig = getModelConfig(type = type)!!,
|
||||
keywordsFile = getKeywordsFile(type = type)!!,
|
||||
modelConfig = getKwsModelConfig(type = type)!!,
|
||||
keywordsFile = getKeywordsFile(type = type),
|
||||
)
|
||||
|
||||
model = SherpaOnnxKws(
|
||||
kws = KeywordSpotter(
|
||||
assetManager = application.assets,
|
||||
config = config,
|
||||
)
|
||||
stream = kws.createStream()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineRecognizer.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineStream.kt
|
||||
@@ -1,162 +0,0 @@
|
||||
// Copyright (c) 2024 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/** Encoder, decoder and joiner strings of a transducer model; all empty by default. */
data class OnlineTransducerModelConfig(
    var encoder: String = "",
    var decoder: String = "",
    var joiner: String = "",
)
|
||||
|
||||
/**
 * Model-level settings: the transducer config, the required tokens string,
 * and the thread count, debug flag, provider name and model type.
 */
data class OnlineModelConfig(
    var transducer: OnlineTransducerModelConfig = OnlineTransducerModelConfig(),
    var tokens: String,
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
    var modelType: String = "",
)
|
||||
|
||||
/** Feature settings: sample rate (default 16000) and feature dimension (default 80). */
data class FeatureConfig(var sampleRate: Int = 16000, var featureDim: Int = 80)
|
||||
|
||||
/**
 * Keyword-spotter settings: feature and model configuration plus the
 * decoding parameters and the keywords file (default "keywords.txt").
 */
data class KeywordSpotterConfig(
    var featConfig: FeatureConfig = FeatureConfig(),
    var modelConfig: OnlineModelConfig,
    var maxActivePaths: Int = 4,
    var keywordsFile: String = "keywords.txt",
    var keywordsScore: Float = 1.5f,
    var keywordsThreshold: Float = 0.25f,
    var numTrailingBlanks: Int = 2,
)
|
||||
|
||||
/**
 * Kotlin wrapper around a native keyword spotter reached through JNI.
 *
 * When [assetManager] is non-null the native object is created with
 * `new(assetManager, config)`; otherwise `newFromFile(config)` is used.
 * The native handle is handed to `delete` from [finalize].
 */
class SherpaOnnxKws(
    assetManager: AssetManager? = null,
    var config: KeywordSpotterConfig,
) {
    // Handle returned by the native constructor.
    private val ptr: Long = if (assetManager == null) {
        newFromFile(config)
    } else {
        new(assetManager, config)
    }

    /** The String currently returned by the native `getKeyword` call. */
    val keyword: String
        get() = getKeyword(ptr)

    fun acceptWaveform(samples: FloatArray, sampleRate: Int) {
        acceptWaveform(ptr, samples, sampleRate)
    }

    fun inputFinished() {
        inputFinished(ptr)
    }

    fun decode() {
        decode(ptr)
    }

    fun isReady(): Boolean {
        return isReady(ptr)
    }

    fun reset(keywords: String): Boolean {
        return reset(ptr, keywords)
    }

    protected fun finalize() {
        delete(ptr)
    }

    private external fun new(
        assetManager: AssetManager,
        config: KeywordSpotterConfig,
    ): Long

    private external fun newFromFile(
        config: KeywordSpotterConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)

    private external fun inputFinished(ptr: Long)

    private external fun getKeyword(ptr: Long): String

    private external fun reset(ptr: Long, keywords: String): Boolean

    private external fun decode(ptr: Long)

    private external fun isReady(ptr: Long): Boolean

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
/** Builds a [FeatureConfig] from the given sample rate and feature dimension. */
fun getFeatureConfig(sampleRate: Int, featureDim: Int): FeatureConfig =
    FeatureConfig(sampleRate = sampleRate, featureDim = featureDim)
|
||||
|
||||
/*
|
||||
Please see
|
||||
https://k2-fsa.github.io/sherpa/onnx/kws/pretrained_models/index.html
|
||||
for a list of pre-trained models.
|
||||
|
||||
We only add a few here. Please change the following code
|
||||
to add your own. (It should be straightforward to add a new model
|
||||
by following the code)
|
||||
|
||||
@param type
|
||||
0 - sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 (Chinese)
|
||||
https://www.modelscope.cn/models/pkufool/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/summary
|
||||
|
||||
1 - sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01 (English)
|
||||
https://www.modelscope.cn/models/pkufool/sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01/summary
|
||||
|
||||
*/
|
||||
/**
 * Returns the keyword-spotting model configuration selected by [type], or
 * null when [type] is neither 0 nor 1.
 */
fun getModelConfig(type: Int): OnlineModelConfig? {
    // Both supported models use the same file names; only the directory differs.
    val modelDir = when (type) {
        0 -> "sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01"
        1 -> "sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01"
        else -> return null
    }

    val transducer = OnlineTransducerModelConfig(
        encoder = "$modelDir/encoder-epoch-12-avg-2-chunk-16-left-64.onnx",
        decoder = "$modelDir/decoder-epoch-12-avg-2-chunk-16-left-64.onnx",
        joiner = "$modelDir/joiner-epoch-12-avg-2-chunk-16-left-64.onnx",
    )
    return OnlineModelConfig(
        transducer = transducer,
        tokens = "$modelDir/tokens.txt",
        modelType = "zipformer2",
    )
}
|
||||
|
||||
/*
|
||||
* Get the default keywords for each model.
|
||||
* Caution: The types and modelDir should be the same as those in getModelConfig
|
||||
* function above.
|
||||
*/
|
||||
/**
 * Returns the default keywords file for the model selected by [type], or an
 * empty string when [type] is neither 0 nor 1.
 */
fun getKeywordsFile(type: Int): String {
    val modelDir = when (type) {
        0 -> "sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01"
        1 -> "sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01"
        else -> return ""
    }
    return "$modelDir/keywords.txt"
}
|
||||
@@ -1,29 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/** JNI entry points for reading mono wave files. */
class WaveReader {
    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }

        /**
         * Reads a mono wave file asset.
         *
         * The returned array has two entries: the first contains a 1-D
         * float array, the second is the sample rate.
         */
        external fun readWaveFromAsset(
            assetManager: AssetManager,
            filename: String,
        ): Array<Any>

        /**
         * Reads a mono wave file from disk.
         *
         * The returned array has two entries: the first contains a 1-D
         * float array, the second is the sample rate.
         */
        external fun readWaveFromFile(
            filename: String,
        ): Array<Any>
    }
}
|
||||
@@ -1,12 +1,12 @@
|
||||
<resources>
|
||||
<string name="app_name">KWS with Next-gen Kaldi</string>
|
||||
<string name="app_name">Keyword spotting</string>
|
||||
<string name="hint">Click the Start button to play keyword spotting with Next-gen Kaldi.
|
||||
\n
|
||||
\n\n\n
|
||||
The source code and pre-trained models are publicly available.
|
||||
Please see https://github.com/k2-fsa/sherpa-onnx for details.
|
||||
</string>
|
||||
<string name="keyword_hint">Input your keywords here, one keyword perline.</string>
|
||||
<string name="keyword_hint">Input your keywords here, one keyword per line.\nTwo example keywords are given below:\n\nn ǐ h ǎo @你好\nd àn g ē d àn g ē @蛋哥蛋哥</string>
|
||||
<string name="start">Start</string>
|
||||
<string name="stop">Stop</string>
|
||||
</resources>
|
||||
|
||||
@@ -2,7 +2,7 @@ package com.k2fsa.sherpa.onnx.speaker.identification
|
||||
|
||||
import androidx.compose.ui.graphics.vector.ImageVector
|
||||
|
||||
data class BarItem (
|
||||
data class BarItem(
|
||||
val title: String,
|
||||
|
||||
// see https://www.composables.com/icons
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
package com.k2fsa.sherpa.onnx.speaker.identification
|
||||
|
||||
sealed class NavRoutes(val route: String) {
|
||||
object Home: NavRoutes("home")
|
||||
object Register: NavRoutes("register")
|
||||
object View: NavRoutes("view")
|
||||
object Help: NavRoutes("help")
|
||||
object Home : NavRoutes("home")
|
||||
object Register : NavRoutes("register")
|
||||
object View : NavRoutes("view")
|
||||
object Help : NavRoutes("help")
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../../../sherpa-onnx/kotlin-api/OnlineStream.kt
|
||||
@@ -1,188 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.util.Log
|
||||
|
||||
private val TAG = "sherpa-onnx"
|
||||
/**
 * Settings for a speaker-embedding extractor: the required model string
 * plus the thread count, debug flag and provider name.
 */
data class SpeakerEmbeddingExtractorConfig(
    val model: String,
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
)
|
||||
|
||||
/**
 * Kotlin-side handle for a native speaker-embedding stream.
 *
 * [ptr] is the native handle; it is set to 0 once the native object has
 * been deleted. Call [release] to free the native object explicitly.
 */
class SpeakerEmbeddingExtractorStream(var ptr: Long) {
    /** Forwards [samples] and [sampleRate] to the native stream. */
    fun acceptWaveform(samples: FloatArray, sampleRate: Int) =
        acceptWaveform(ptr, samples, sampleRate)

    /** Forwards the end-of-input notification to the native stream. */
    fun inputFinished() = inputFinished(ptr)

    protected fun finalize() {
        // release() and garbage-collector finalization can both reach this
        // point; only hand a live handle to the native delete so it runs at
        // most once per object.
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    /** Frees the native stream now; calling it again is a no-op. */
    fun release() = finalize()

    // NOTE(review): not called anywhere in this class; looks like a leftover
    // test hook. Confirm before removing.
    private external fun myTest(ptr: Long, v: Array<FloatArray>)

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)

    private external fun inputFinished(ptr: Long)

    private external fun delete(ptr: Long)

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
/**
 * Kotlin-side handle for a native speaker-embedding extractor.
 *
 * When [assetManager] is non-null the native object is created with
 * `new(assetManager, config)`; otherwise `newFromFile(config)` is used.
 * Call [release] to free the native object explicitly.
 */
class SpeakerEmbeddingExtractor(
    assetManager: AssetManager? = null,
    config: SpeakerEmbeddingExtractorConfig,
) {
    // Native handle; 0 once the native object has been deleted.
    private var ptr: Long

    init {
        ptr = if (assetManager != null) {
            new(assetManager, config)
        } else {
            newFromFile(config)
        }
    }

    protected fun finalize() {
        // release() and garbage-collector finalization can both reach this
        // point; only hand a live handle to the native delete so it runs at
        // most once per object.
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    /** Frees the native extractor now; calling it again is a no-op. */
    fun release() = finalize()

    /** Creates a native stream and wraps its handle. */
    fun createStream(): SpeakerEmbeddingExtractorStream {
        val p = createStream(ptr)
        return SpeakerEmbeddingExtractorStream(p)
    }

    /** Returns the result of the native `isReady` call for [stream]. */
    fun isReady(stream: SpeakerEmbeddingExtractorStream) = isReady(ptr, stream.ptr)

    /** Returns the FloatArray produced by the native `compute` call for [stream]. */
    fun compute(stream: SpeakerEmbeddingExtractorStream) = compute(ptr, stream.ptr)

    /** Returns the Int reported by the native `dim` call. */
    fun dim() = dim(ptr)

    private external fun new(
        assetManager: AssetManager,
        config: SpeakerEmbeddingExtractorConfig,
    ): Long

    private external fun newFromFile(
        config: SpeakerEmbeddingExtractorConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun createStream(ptr: Long): Long

    private external fun isReady(ptr: Long, streamPtr: Long): Boolean

    private external fun compute(ptr: Long, streamPtr: Long): FloatArray

    private external fun dim(ptr: Long): Int

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
/**
 * Kotlin-side wrapper around the native speaker embedding manager.
 *
 * Every public method forwards to the JNI function of the same purpose using
 * the native handle created from [dim].
 *
 * @property dim value passed to the native constructor.
 */
class SpeakerEmbeddingManager(val dim: Int) {
    // Native handle; 0 means the native object has already been deleted.
    private var ptr: Long

    init {
        ptr = new(dim)
    }

    /**
     * Deletes the native object at most once.
     *
     * Both [release] and the garbage collector may end up here, so the handle
     * is checked first to avoid deleting it a second time.
     */
    protected fun finalize() {
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    /** Explicitly frees the native object; safe to call more than once. */
    fun release() = finalize()

    /** Registers a single [embedding] under [name]; returns the native result. */
    fun add(name: String, embedding: FloatArray) = add(ptr, name, embedding)

    /** Registers several embeddings under [name]; returns the native result. */
    fun add(name: String, embedding: Array<FloatArray>) = addList(ptr, name, embedding)

    fun remove(name: String) = remove(ptr, name)

    /** Returns the string produced by the native search for [embedding] and [threshold]. */
    fun search(embedding: FloatArray, threshold: Float) = search(ptr, embedding, threshold)

    fun verify(name: String, embedding: FloatArray, threshold: Float) =
        verify(ptr, name, embedding, threshold)

    fun contains(name: String) = contains(ptr, name)

    fun numSpeakers() = numSpeakers(ptr)

    fun allSpeakerNames() = allSpeakerNames(ptr)

    private external fun new(dim: Int): Long

    private external fun delete(ptr: Long)

    private external fun add(ptr: Long, name: String, embedding: FloatArray): Boolean

    private external fun addList(ptr: Long, name: String, embedding: Array<FloatArray>): Boolean

    private external fun remove(ptr: Long, name: String): Boolean

    private external fun search(ptr: Long, embedding: FloatArray, threshold: Float): String

    private external fun verify(
        ptr: Long,
        name: String,
        embedding: FloatArray,
        threshold: Float,
    ): Boolean

    private external fun contains(ptr: Long, name: String): Boolean

    private external fun numSpeakers(ptr: Long): Int

    private external fun allSpeakerNames(ptr: Long): Array<String>

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
// Please download the model file from
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
||||
// and put it inside the assets directory.
|
||||
//
|
||||
// Please don't put it in a subdirectory of assets
|
||||
private val modelName = "3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
|
||||
|
||||
/**
 * Process-wide holder for one extractor and one manager instance.
 *
 * [initExtractor] must have run before [extractor] or [manager] is read;
 * both getters assert non-null with `!!`.
 */
object SpeakerRecognition {
    var _extractor: SpeakerEmbeddingExtractor? = null
    var _manager: SpeakerEmbeddingManager? = null

    val extractor: SpeakerEmbeddingExtractor
        get() = _extractor!!

    val manager: SpeakerEmbeddingManager
        get() = _manager!!

    /**
     * Creates the extractor and the matching manager once.
     *
     * Does nothing when an extractor already exists. The whole check-and-create
     * sequence runs while holding the lock on this object.
     */
    fun initExtractor(assetManager: AssetManager? = null) {
        synchronized(this) {
            if (_extractor == null) {
                Log.i(TAG, "Initializing speaker embedding extractor")

                val extractorConfig = SpeakerEmbeddingExtractorConfig(
                    model = modelName,
                    numThreads = 2,
                    debug = false,
                    provider = "cpu",
                )
                val created = SpeakerEmbeddingExtractor(
                    assetManager = assetManager,
                    config = extractorConfig,
                )
                _extractor = created

                // The manager is sized from the dimension the extractor reports.
                _manager = SpeakerEmbeddingManager(dim = created.dim())
            }
        }
    }
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../../../sherpa-onnx/kotlin-api/Speaker.kt
|
||||
@@ -1,4 +1,4 @@
|
||||
@file:OptIn(ExperimentalMaterial3Api::class, ExperimentalFoundationApi::class)
|
||||
@file:OptIn(ExperimentalMaterial3Api::class)
|
||||
|
||||
package com.k2fsa.sherpa.onnx.slid
|
||||
|
||||
@@ -9,11 +9,9 @@ import android.media.AudioFormat
|
||||
import android.media.AudioRecord
|
||||
import android.media.MediaRecorder
|
||||
import android.util.Log
|
||||
import androidx.compose.foundation.ExperimentalFoundationApi
|
||||
import androidx.compose.foundation.layout.Box
|
||||
import androidx.compose.foundation.layout.Column
|
||||
import androidx.compose.foundation.layout.PaddingValues
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.foundation.layout.Spacer
|
||||
import androidx.compose.foundation.layout.fillMaxSize
|
||||
import androidx.compose.foundation.layout.height
|
||||
@@ -31,6 +29,7 @@ import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.remember
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.compose.ui.Alignment
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.platform.LocalContext
|
||||
import androidx.compose.ui.text.font.FontWeight
|
||||
import androidx.compose.ui.unit.dp
|
||||
@@ -63,13 +62,13 @@ fun Home() {
|
||||
}
|
||||
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private val sampleRateInHz = 16000
|
||||
private const val sampleRateInHz = 16000
|
||||
|
||||
@Composable
|
||||
fun MyApp(padding: PaddingValues) {
|
||||
val activity = LocalContext.current as Activity
|
||||
var isStarted by remember { mutableStateOf(false) }
|
||||
var result by remember { mutableStateOf<String>("") }
|
||||
var result by remember { mutableStateOf("") }
|
||||
|
||||
val onButtonClick: () -> Unit = {
|
||||
isStarted = !isStarted
|
||||
@@ -114,12 +113,12 @@ fun MyApp(padding: PaddingValues) {
|
||||
}
|
||||
Log.i(TAG, "Stop recording")
|
||||
Log.i(TAG, "Start recognition")
|
||||
val samples = Flatten(sampleList)
|
||||
val samples = flatten(sampleList)
|
||||
val stream = Slid.slid.createStream()
|
||||
stream.acceptWaveform(samples, sampleRateInHz)
|
||||
val lang = Slid.slid.compute(stream)
|
||||
|
||||
result = Slid.localeMap.get(lang) ?: lang
|
||||
result = Slid.localeMap[lang] ?: lang
|
||||
|
||||
stream.release()
|
||||
}
|
||||
@@ -152,7 +151,7 @@ fun MyApp(padding: PaddingValues) {
|
||||
}
|
||||
}
|
||||
|
||||
fun Flatten(sampleList: ArrayList<FloatArray>): FloatArray {
|
||||
fun flatten(sampleList: ArrayList<FloatArray>): FloatArray {
|
||||
var totalSamples = 0
|
||||
for (a in sampleList) {
|
||||
totalSamples += a.size
|
||||
|
||||
@@ -10,12 +10,9 @@ import androidx.activity.compose.setContent
|
||||
import androidx.compose.foundation.layout.fillMaxSize
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.Surface
|
||||
import androidx.compose.material3.Text
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.tooling.preview.Preview
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.SpokenLanguageIdentification
|
||||
import com.k2fsa.sherpa.onnx.slid.ui.theme.SherpaOnnxSpokenLanguageIdentificationTheme
|
||||
|
||||
const val TAG = "sherpa-onnx"
|
||||
@@ -32,6 +29,7 @@ class MainActivity : ComponentActivity() {
|
||||
ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
|
||||
Slid.initSlid(this.assets)
|
||||
}
|
||||
|
||||
@Suppress("DEPRECATION")
|
||||
@Deprecated("Deprecated in Java")
|
||||
override fun onRequestPermissionsResult(
|
||||
|
||||
@@ -1 +1 @@
|
||||
../../../../../../../../../../SherpaOnnxAudioTagging/app/src/main/java/com/k2fsa/sherpa/onnx/audio/tagging/OfflineStream.kt
|
||||
../../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineStream.kt
|
||||
@@ -1,102 +0,0 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.util.Log
|
||||
|
||||
private val TAG = "sherpa-onnx"
|
||||
|
||||
/**
 * Whisper model settings for spoken language identification.
 *
 * @property encoder name/path of the encoder model file.
 * @property decoder name/path of the decoder model file.
 * @property tailPaddings defaults to -1.
 */
data class SpokenLanguageIdentificationWhisperConfig(
    var encoder: String,
    var decoder: String,
    var tailPaddings: Int = -1,
)
|
||||
|
||||
/**
 * Top-level settings for spoken language identification.
 *
 * @property whisper the Whisper model settings (required).
 * @property numThreads defaults to 1.
 * @property debug defaults to false.
 * @property provider defaults to "cpu".
 */
data class SpokenLanguageIdentificationConfig(
    var whisper: SpokenLanguageIdentificationWhisperConfig,
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
)
|
||||
|
||||
/**
 * Kotlin-side wrapper around the native spoken language identifier.
 *
 * The native object is created from assets when an [AssetManager] is given
 * and from files otherwise.
 *
 * @param assetManager optional asset manager passed to the native constructor.
 * @param config configuration passed to the native constructor.
 */
class SpokenLanguageIdentification(
    assetManager: AssetManager? = null,
    config: SpokenLanguageIdentificationConfig,
) {
    // Native handle; 0 means the native object has already been deleted.
    private var ptr: Long =
        if (assetManager != null) newFromAsset(assetManager, config) else newFromFile(config)

    /** Deletes the native object unless that already happened. */
    protected fun finalize() {
        if (ptr == 0L) {
            return
        }
        delete(ptr)
        ptr = 0
    }

    /** Explicitly frees the native object. */
    fun release() {
        finalize()
    }

    /** Creates a native stream and wraps its handle in an [OfflineStream]. */
    fun createStream(): OfflineStream = OfflineStream(createStream(ptr))

    /** Returns the string produced by the native `compute` call for [stream]. */
    fun compute(stream: OfflineStream) = compute(ptr, stream.ptr)

    private external fun newFromAsset(
        assetManager: AssetManager,
        config: SpokenLanguageIdentificationConfig,
    ): Long

    private external fun newFromFile(
        config: SpokenLanguageIdentificationConfig,
    ): Long

    private external fun delete(ptr: Long)

    private external fun createStream(ptr: Long): Long

    private external fun compute(ptr: Long, streamPtr: Long): String

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
// please refer to
|
||||
// https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/pretrained_models.html#whisper
|
||||
// to download more models
|
||||
/**
 * Returns a ready-made configuration for one of the bundled Whisper models.
 *
 * @param type 0 selects the model in `sherpa-onnx-whisper-tiny`, 1 the model in
 *   `sherpa-onnx-whisper-base`.
 * @param numThreads value stored in the returned configuration's `numThreads`.
 * @return the configuration, or `null` when [type] is not a known model index.
 */
fun getSpokenLanguageIdentificationConfig(
    type: Int,
    numThreads: Int = 1,
): SpokenLanguageIdentificationConfig? {
    // Directory name and file-name prefix of the selected model. The files in
    // each directory are prefixed with the model size, so the base directory
    // must be paired with base-* files rather than tiny-* ones.
    val (modelDir, prefix) = when (type) {
        0 -> "sherpa-onnx-whisper-tiny" to "tiny"
        1 -> "sherpa-onnx-whisper-base" to "base"
        else -> return null
    }

    return SpokenLanguageIdentificationConfig(
        whisper = SpokenLanguageIdentificationWhisperConfig(
            encoder = "$modelDir/$prefix-encoder.int8.onnx",
            decoder = "$modelDir/$prefix-decoder.int8.onnx",
        ),
        numThreads = numThreads,
        debug = true,
    )
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../../sherpa-onnx/kotlin-api/SpokenLanguageIdentification.kt
|
||||
@@ -15,10 +15,10 @@ object Slid {
|
||||
get() {
|
||||
return _slid!!
|
||||
}
|
||||
// Read-only view of the backing _localeMap.
val localeMap: Map<String, String>
    get() = _localeMap
|
||||
|
||||
fun initSlid(assetManager: AssetManager? = null, numThreads: Int = 1) {
|
||||
synchronized(this) {
|
||||
@@ -31,7 +31,7 @@ object Slid {
|
||||
}
|
||||
|
||||
if (_localeMap.isEmpty()) {
|
||||
val allLang = Locale.getISOLanguages();
|
||||
val allLang = Locale.getISOLanguages()
|
||||
for (lang in allLang) {
|
||||
val locale = Locale(lang)
|
||||
_localeMap[lang] = locale.displayName
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.media.*
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioManager
|
||||
import android.media.AudioTrack
|
||||
import android.media.MediaPlayer
|
||||
import android.net.Uri
|
||||
import android.os.Bundle
|
||||
import android.util.Log
|
||||
@@ -212,7 +216,7 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
if (dictDir != null) {
|
||||
val newDir = copyDataDir( modelDir!!)
|
||||
val newDir = copyDataDir(modelDir!!)
|
||||
modelDir = newDir + "/" + modelDir
|
||||
dictDir = modelDir + "/" + "dict"
|
||||
ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst"
|
||||
@@ -220,7 +224,9 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
val config = getOfflineTtsConfig(
|
||||
modelDir = modelDir!!, modelName = modelName!!, lexicon = lexicon ?: "",
|
||||
modelDir = modelDir!!,
|
||||
modelName = modelName!!,
|
||||
lexicon = lexicon ?: "",
|
||||
dataDir = dataDir ?: "",
|
||||
dictDir = dictDir ?: "",
|
||||
ruleFsts = ruleFsts ?: "",
|
||||
@@ -232,11 +238,11 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
|
||||
// Calls copyAssets(dataDir) and returns the absolute path of the application's
// external files directory. Progress is reported through Log.i only.
private fun copyDataDir(dataDir: String): String {
    Log.i(TAG, "data dir is $dataDir")
    copyAssets(dataDir)

    val newDataDir = application.getExternalFilesDir(null)!!.absolutePath
    Log.i(TAG, "newDataDir: $newDataDir")
    return newDataDir
}
|
||||
|
||||
@@ -256,7 +262,7 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
} catch (ex: IOException) {
|
||||
Log.e(TAG, "Failed to copy $path. ${ex.toString()}")
|
||||
Log.e(TAG, "Failed to copy $path. $ex")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,7 +282,7 @@ class MainActivity : AppCompatActivity() {
|
||||
ostream.flush()
|
||||
ostream.close()
|
||||
} catch (ex: Exception) {
|
||||
Log.e(TAG, "Failed to copy $filename, ${ex.toString()}")
|
||||
Log.e(TAG, "Failed to copy $filename, $ex")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,10 +49,10 @@ class OfflineTts(
|
||||
private var ptr: Long
|
||||
|
||||
init {
|
||||
if (assetManager != null) {
|
||||
ptr = newFromAsset(assetManager, config)
|
||||
ptr = if (assetManager != null) {
|
||||
newFromAsset(assetManager, config)
|
||||
} else {
|
||||
ptr = newFromFile(config)
|
||||
newFromFile(config)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,7 +65,7 @@ class OfflineTts(
|
||||
sid: Int = 0,
|
||||
speed: Float = 1.0f
|
||||
): GeneratedAudio {
|
||||
var objArray = generateImpl(ptr, text = text, sid = sid, speed = speed)
|
||||
val objArray = generateImpl(ptr, text = text, sid = sid, speed = speed)
|
||||
return GeneratedAudio(
|
||||
samples = objArray[0] as FloatArray,
|
||||
sampleRate = objArray[1] as Int
|
||||
@@ -78,7 +78,13 @@ class OfflineTts(
|
||||
speed: Float = 1.0f,
|
||||
callback: (samples: FloatArray) -> Unit
|
||||
): GeneratedAudio {
|
||||
var objArray = generateWithCallbackImpl(ptr, text = text, sid = sid, speed = speed, callback=callback)
|
||||
val objArray = generateWithCallbackImpl(
|
||||
ptr,
|
||||
text = text,
|
||||
sid = sid,
|
||||
speed = speed,
|
||||
callback = callback
|
||||
)
|
||||
return GeneratedAudio(
|
||||
samples = objArray[0] as FloatArray,
|
||||
sampleRate = objArray[1] as Int
|
||||
@@ -87,10 +93,10 @@ class OfflineTts(
|
||||
|
||||
fun allocate(assetManager: AssetManager? = null) {
|
||||
if (ptr == 0L) {
|
||||
if (assetManager != null) {
|
||||
ptr = newFromAsset(assetManager, config)
|
||||
ptr = if (assetManager != null) {
|
||||
newFromAsset(assetManager, config)
|
||||
} else {
|
||||
ptr = newFromFile(config)
|
||||
newFromFile(config)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,9 +109,14 @@ class OfflineTts(
|
||||
}
|
||||
|
||||
protected fun finalize() {
|
||||
delete(ptr)
|
||||
if (ptr != 0L) {
|
||||
delete(ptr)
|
||||
ptr = 0
|
||||
}
|
||||
}
|
||||
|
||||
fun release() = finalize()
|
||||
|
||||
private external fun newFromAsset(
|
||||
assetManager: AssetManager,
|
||||
config: OfflineTtsConfig,
|
||||
@@ -123,14 +134,14 @@ class OfflineTts(
|
||||
// - the first entry is an 1-D float array containing audio samples.
|
||||
// Each sample is normalized to the range [-1, 1]
|
||||
// - the second entry is the sample rate
|
||||
external fun generateImpl(
|
||||
private external fun generateImpl(
|
||||
ptr: Long,
|
||||
text: String,
|
||||
sid: Int = 0,
|
||||
speed: Float = 1.0f
|
||||
): Array<Any>
|
||||
|
||||
external fun generateWithCallbackImpl(
|
||||
private external fun generateWithCallbackImpl(
|
||||
ptr: Long,
|
||||
text: String,
|
||||
sid: Int = 0,
|
||||
@@ -156,7 +167,7 @@ fun getOfflineTtsConfig(
|
||||
dictDir: String,
|
||||
ruleFsts: String,
|
||||
ruleFars: String
|
||||
): OfflineTtsConfig? {
|
||||
): OfflineTtsConfig {
|
||||
return OfflineTtsConfig(
|
||||
model = OfflineTtsModelConfig(
|
||||
vits = OfflineTtsVitsModelConfig(
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
package com.k2fsa.sherpa.onnx.tts.engine
|
||||
|
||||
import android.content.Intent
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import android.os.Bundle
|
||||
import android.speech.tts.TextToSpeech
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
|
||||
class CheckVoiceData : AppCompatActivity() {
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
super.onCreate(savedInstanceState)
|
||||
val intent = Intent().apply {
|
||||
putStringArrayListExtra(TextToSpeech.Engine.EXTRA_AVAILABLE_VOICES, arrayListOf(TtsEngine.lang))
|
||||
putStringArrayListExtra(
|
||||
TextToSpeech.Engine.EXTRA_AVAILABLE_VOICES,
|
||||
arrayListOf(TtsEngine.lang)
|
||||
)
|
||||
putStringArrayListExtra(TextToSpeech.Engine.EXTRA_UNAVAILABLE_VOICES, arrayListOf())
|
||||
}
|
||||
setResult(TextToSpeech.Engine.CHECK_VOICE_DATA_PASS, intent)
|
||||
|
||||
@@ -2,7 +2,6 @@ package com.k2fsa.sherpa.onnx.tts.engine
|
||||
|
||||
import android.app.Activity
|
||||
import android.content.Intent
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import android.os.Bundle
|
||||
import android.speech.tts.TextToSpeech
|
||||
|
||||
@@ -12,120 +11,168 @@ fun getSampleText(lang: String): String {
|
||||
"ara" -> {
|
||||
text = "هذا هو محرك تحويل النص إلى كلام باستخدام الجيل القادم من كالدي"
|
||||
}
|
||||
|
||||
"ben" -> {
|
||||
text = "এটি একটি টেক্সট-টু-স্পীচ ইঞ্জিন যা পরবর্তী প্রজন্মের কালডি ব্যবহার করে"
|
||||
}
|
||||
|
||||
"bul" -> {
|
||||
text = "Това е машина за преобразуване на текст в реч, използваща Kaldi от следващо поколение"
|
||||
text =
|
||||
"Това е машина за преобразуване на текст в реч, използваща Kaldi от следващо поколение"
|
||||
}
|
||||
|
||||
"cat" -> {
|
||||
text = "Aquest és un motor de text a veu que utilitza Kaldi de nova generació"
|
||||
}
|
||||
|
||||
"ces" -> {
|
||||
text = "Toto je převodník textu na řeč využívající novou generaci kaldi"
|
||||
}
|
||||
|
||||
"dan" -> {
|
||||
text = "Dette er en tekst til tale-motor, der bruger næste generation af kaldi"
|
||||
}
|
||||
|
||||
"deu" -> {
|
||||
text = "Dies ist eine Text-to-Speech-Engine, die Kaldi der nächsten Generation verwendet"
|
||||
text =
|
||||
"Dies ist eine Text-to-Speech-Engine, die Kaldi der nächsten Generation verwendet"
|
||||
}
|
||||
|
||||
"ell" -> {
|
||||
text = "Αυτή είναι μια μηχανή κειμένου σε ομιλία που χρησιμοποιεί kaldi επόμενης γενιάς"
|
||||
}
|
||||
|
||||
"eng" -> {
|
||||
text = "This is a text-to-speech engine using next generation Kaldi"
|
||||
}
|
||||
|
||||
"est" -> {
|
||||
text = "See on teksti kõneks muutmise mootor, mis kasutab järgmise põlvkonna Kaldi"
|
||||
}
|
||||
|
||||
"fin" -> {
|
||||
text = "Tämä on tekstistä puheeksi -moottori, joka käyttää seuraavan sukupolven kaldia"
|
||||
}
|
||||
|
||||
"fra" -> {
|
||||
text = "Il s'agit d'un moteur de synthèse vocale utilisant Kaldi de nouvelle génération"
|
||||
}
|
||||
|
||||
"gle" -> {
|
||||
text = "Is inneall téacs-go-hurlabhra é seo a úsáideann Kaldi den chéad ghlúin eile"
|
||||
}
|
||||
|
||||
"hrv" -> {
|
||||
text = "Ovo je mehanizam za pretvaranje teksta u govor koji koristi Kaldi sljedeće generacije"
|
||||
text =
|
||||
"Ovo je mehanizam za pretvaranje teksta u govor koji koristi Kaldi sljedeće generacije"
|
||||
}
|
||||
|
||||
"hun" -> {
|
||||
text = "Ez egy szövegfelolvasó motor a következő generációs kaldi használatával"
|
||||
}
|
||||
|
||||
"isl" -> {
|
||||
text = "Þetta er texta í tal vél sem notar næstu kynslóð kaldi"
|
||||
}
|
||||
|
||||
"ita" -> {
|
||||
text = "Questo è un motore di sintesi vocale che utilizza kaldi di nuova generazione"
|
||||
}
|
||||
|
||||
"kat" -> {
|
||||
text = "ეს არის ტექსტიდან მეტყველების ძრავა შემდეგი თაობის კალდის გამოყენებით"
|
||||
}
|
||||
|
||||
"kaz" -> {
|
||||
text = "Бұл келесі буын kaldi көмегімен мәтіннен сөйлеуге арналған қозғалтқыш"
|
||||
}
|
||||
|
||||
"mlt" -> {
|
||||
text = "Din hija magna text-to-speech li tuża Kaldi tal-ġenerazzjoni li jmiss"
|
||||
}
|
||||
|
||||
"lav" -> {
|
||||
text = "Šis ir teksta pārvēršanas runā dzinējs, kas izmanto nākamās paaudzes Kaldi"
|
||||
}
|
||||
|
||||
"lit" -> {
|
||||
text = "Tai teksto į kalbą variklis, kuriame naudojamas naujos kartos Kaldi"
|
||||
}
|
||||
|
||||
"ltz" -> {
|
||||
text = "Dëst ass en Text-zu-Speech-Motor mat der nächster Generatioun Kaldi"
|
||||
}
|
||||
|
||||
"nep" -> {
|
||||
text = "यो अर्को पुस्ता काल्डी प्रयोग गरेर स्पीच इन्जिनको पाठ हो"
|
||||
}
|
||||
|
||||
"nld" -> {
|
||||
text = "Dit is een tekst-naar-spraak-engine die gebruik maakt van Kaldi van de volgende generatie"
|
||||
text =
|
||||
"Dit is een tekst-naar-spraak-engine die gebruik maakt van Kaldi van de volgende generatie"
|
||||
}
|
||||
|
||||
"nor" -> {
|
||||
text = "Dette er en tekst til tale-motor som bruker neste generasjons kaldi"
|
||||
}
|
||||
|
||||
"pol" -> {
|
||||
text = "Jest to silnik syntezatora mowy wykorzystujący Kaldi nowej generacji"
|
||||
}
|
||||
|
||||
"por" -> {
|
||||
text = "Este é um mecanismo de conversão de texto em fala usando Kaldi de próxima geração"
|
||||
text =
|
||||
"Este é um mecanismo de conversão de texto em fala usando Kaldi de próxima geração"
|
||||
}
|
||||
|
||||
"ron" -> {
|
||||
text = "Acesta este un motor text to speech care folosește generația următoare de kadi"
|
||||
}
|
||||
|
||||
"rus" -> {
|
||||
text = "Это движок преобразования текста в речь, использующий Kaldi следующего поколения."
|
||||
text =
|
||||
"Это движок преобразования текста в речь, использующий Kaldi следующего поколения."
|
||||
}
|
||||
|
||||
"slk" -> {
|
||||
text = "Toto je nástroj na prevod textu na reč využívajúci kaldi novej generácie"
|
||||
}
|
||||
|
||||
"slv" -> {
|
||||
text = "To je mehanizem za pretvorbo besedila v govor, ki uporablja Kaldi naslednje generacije"
|
||||
text =
|
||||
"To je mehanizem za pretvorbo besedila v govor, ki uporablja Kaldi naslednje generacije"
|
||||
}
|
||||
|
||||
"spa" -> {
|
||||
text = "Este es un motor de texto a voz que utiliza kaldi de próxima generación."
|
||||
}
|
||||
|
||||
"srp" -> {
|
||||
text = "Ово је механизам за претварање текста у говор који користи калди следеће генерације"
|
||||
text =
|
||||
"Ово је механизам за претварање текста у говор који користи калди следеће генерације"
|
||||
}
|
||||
|
||||
"swa" -> {
|
||||
text = "Haya ni maandishi kwa injini ya hotuba kwa kutumia kizazi kijacho kaldi"
|
||||
}
|
||||
|
||||
"swe" -> {
|
||||
text = "Detta är en text till tal-motor som använder nästa generations kaldi"
|
||||
}
|
||||
|
||||
"tur" -> {
|
||||
text = "Bu, yeni nesil kaldi'yi kullanan bir metinden konuşmaya motorudur"
|
||||
}
|
||||
|
||||
"ukr" -> {
|
||||
text = "Це механізм перетворення тексту на мовлення, який використовує kaldi нового покоління"
|
||||
text =
|
||||
"Це механізм перетворення тексту на мовлення, який використовує kaldi нового покоління"
|
||||
}
|
||||
|
||||
"vie" -> {
|
||||
text = "Đây là công cụ chuyển văn bản thành giọng nói sử dụng kaldi thế hệ tiếp theo"
|
||||
}
|
||||
|
||||
"zho", "cmn" -> {
|
||||
text = "使用新一代卡尔迪的语音合成引擎"
|
||||
}
|
||||
@@ -137,13 +184,13 @@ class GetSampleText : Activity() {
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
|
||||
super.onCreate(savedInstanceState)
|
||||
var result = TextToSpeech.LANG_AVAILABLE
|
||||
var text: String = getSampleText(TtsEngine.lang ?: "")
|
||||
val text: String = getSampleText(TtsEngine.lang ?: "")
|
||||
if (text.isEmpty()) {
|
||||
result = TextToSpeech.LANG_NOT_SUPPORTED
|
||||
}
|
||||
|
||||
val intent = Intent().apply{
|
||||
if(result == TextToSpeech.LANG_AVAILABLE) {
|
||||
val intent = Intent().apply {
|
||||
if (result == TextToSpeech.LANG_AVAILABLE) {
|
||||
putExtra(TextToSpeech.Engine.EXTRA_SAMPLE_TEXT, text)
|
||||
} else {
|
||||
putExtra("sampleText", text)
|
||||
|
||||
@@ -26,20 +26,16 @@ import androidx.compose.material3.Scaffold
|
||||
import androidx.compose.material3.Slider
|
||||
import androidx.compose.material3.Surface
|
||||
import androidx.compose.material3.Text
|
||||
import androidx.compose.material3.TextField
|
||||
import androidx.compose.material3.TopAppBar
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.remember
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.text.input.KeyboardType
|
||||
import androidx.compose.ui.tooling.preview.Preview
|
||||
import androidx.compose.ui.unit.dp
|
||||
import com.k2fsa.sherpa.onnx.tts.engine.ui.theme.SherpaOnnxTtsEngineTheme
|
||||
import java.io.File
|
||||
import java.lang.NumberFormatException
|
||||
|
||||
const val TAG = "sherpa-onnx-tts-engine"
|
||||
|
||||
@@ -76,7 +72,7 @@ class MainActivity : ComponentActivity() {
|
||||
val testTextContent = getSampleText(TtsEngine.lang ?: "")
|
||||
|
||||
var testText by remember { mutableStateOf(testTextContent) }
|
||||
|
||||
|
||||
val numSpeakers = TtsEngine.tts!!.numSpeakers()
|
||||
if (numSpeakers > 1) {
|
||||
OutlinedTextField(
|
||||
@@ -88,7 +84,7 @@ class MainActivity : ComponentActivity() {
|
||||
try {
|
||||
TtsEngine.speakerId = it.toString().toInt()
|
||||
} catch (ex: NumberFormatException) {
|
||||
Log.i(TAG, "Invalid input: ${it}")
|
||||
Log.i(TAG, "Invalid input: $it")
|
||||
TtsEngine.speakerId = 0
|
||||
}
|
||||
}
|
||||
@@ -119,7 +115,7 @@ class MainActivity : ComponentActivity() {
|
||||
Button(
|
||||
modifier = Modifier.padding(20.dp),
|
||||
onClick = {
|
||||
Log.i(TAG, "Clicked, text: ${testText}")
|
||||
Log.i(TAG, "Clicked, text: $testText")
|
||||
if (testText.isBlank() || testText.isEmpty()) {
|
||||
Toast.makeText(
|
||||
applicationContext,
|
||||
@@ -136,7 +132,7 @@ class MainActivity : ComponentActivity() {
|
||||
val filename =
|
||||
application.filesDir.absolutePath + "/generated.wav"
|
||||
val ok =
|
||||
audio.samples.size > 0 && audio.save(filename)
|
||||
audio.samples.isNotEmpty() && audio.save(filename)
|
||||
|
||||
if (ok) {
|
||||
stopMediaPlayer()
|
||||
|
||||
@@ -4,8 +4,10 @@ import android.content.Context
|
||||
import android.content.res.AssetManager
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.MutableState
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import com.k2fsa.sherpa.onnx.*
|
||||
import androidx.compose.runtime.mutableFloatStateOf
|
||||
import androidx.compose.runtime.mutableIntStateOf
|
||||
import com.k2fsa.sherpa.onnx.OfflineTts
|
||||
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
|
||||
import java.io.File
|
||||
import java.io.FileOutputStream
|
||||
import java.io.IOException
|
||||
@@ -21,8 +23,8 @@ object TtsEngine {
|
||||
var lang: String? = null
|
||||
|
||||
|
||||
val speedState: MutableState<Float> = mutableStateOf(1.0F)
|
||||
val speakerIdState: MutableState<Int> = mutableStateOf(0)
|
||||
val speedState: MutableState<Float> = mutableFloatStateOf(1.0F)
|
||||
val speakerIdState: MutableState<Int> = mutableIntStateOf(0)
|
||||
|
||||
var speed: Float
|
||||
get() = speedState.value
|
||||
@@ -113,15 +115,15 @@ object TtsEngine {
|
||||
|
||||
if (dataDir != null) {
|
||||
val newDir = copyDataDir(context, modelDir!!)
|
||||
modelDir = newDir + "/" + modelDir
|
||||
dataDir = newDir + "/" + dataDir
|
||||
modelDir = "$newDir/$modelDir"
|
||||
dataDir = "$newDir/$dataDir"
|
||||
assets = null
|
||||
}
|
||||
|
||||
if (dictDir != null) {
|
||||
val newDir = copyDataDir(context, modelDir!!)
|
||||
modelDir = newDir + "/" + modelDir
|
||||
dictDir = modelDir + "/" + "dict"
|
||||
modelDir = "$newDir/$modelDir"
|
||||
dictDir = "$modelDir/dict"
|
||||
ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst"
|
||||
assets = null
|
||||
}
|
||||
@@ -132,18 +134,18 @@ object TtsEngine {
|
||||
dictDir = dictDir ?: "",
|
||||
ruleFsts = ruleFsts ?: "",
|
||||
ruleFars = ruleFars ?: ""
|
||||
)!!
|
||||
)
|
||||
|
||||
tts = OfflineTts(assetManager = assets, config = config)
|
||||
}
|
||||
|
||||
|
||||
// Calls copyAssets(context, dataDir) and returns the absolute path of the
// context's external files directory. Progress is reported through Log.i only.
private fun copyDataDir(context: Context, dataDir: String): String {
    Log.i(TAG, "data dir is $dataDir")
    copyAssets(context, dataDir)

    val newDataDir = context.getExternalFilesDir(null)!!.absolutePath
    Log.i(TAG, "newDataDir: $newDataDir")
    return newDataDir
}
|
||||
|
||||
@@ -158,12 +160,12 @@ object TtsEngine {
|
||||
val dir = File(fullPath)
|
||||
dir.mkdirs()
|
||||
for (asset in assets.iterator()) {
|
||||
val p: String = if (path == "") "" else path + "/"
|
||||
val p: String = if (path == "") "" else "$path/"
|
||||
copyAssets(context, p + asset)
|
||||
}
|
||||
}
|
||||
} catch (ex: IOException) {
|
||||
Log.e(TAG, "Failed to copy $path. ${ex.toString()}")
|
||||
Log.e(TAG, "Failed to copy $path. $ex")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +185,7 @@ object TtsEngine {
|
||||
ostream.flush()
|
||||
ostream.close()
|
||||
} catch (ex: Exception) {
|
||||
Log.e(TAG, "Failed to copy $filename, ${ex.toString()}")
|
||||
Log.e(TAG, "Failed to copy $filename, $ex")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ import android.speech.tts.SynthesisRequest
|
||||
import android.speech.tts.TextToSpeech
|
||||
import android.speech.tts.TextToSpeechService
|
||||
import android.util.Log
|
||||
import com.k2fsa.sherpa.onnx.*
|
||||
|
||||
/*
|
||||
https://developer.android.com/reference/java/util/Locale#getISO3Language()
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package com.k2fsa.sherpa.onnx.tts.engine
|
||||
|
||||
import android.app.Application
|
||||
import android.os.FileUtils.ProgressListener
|
||||
import android.speech.tts.TextToSpeech
|
||||
import android.speech.tts.TextToSpeech.OnInitListener
|
||||
import android.speech.tts.UtteranceProgressListener
|
||||
@@ -27,7 +26,7 @@ class TtsViewModel : ViewModel() {
|
||||
private val onInitListener = object : OnInitListener {
|
||||
override fun onInit(status: Int) {
|
||||
when (status) {
|
||||
TextToSpeech.SUCCESS -> Log.i(TAG, "Init tts succeded")
|
||||
TextToSpeech.SUCCESS -> Log.i(TAG, "Init tts succeeded")
|
||||
TextToSpeech.ERROR -> Log.i(TAG, "Init tts failed")
|
||||
else -> Log.i(TAG, "Unknown status $status")
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
android:theme="@style/Theme.SherpaOnnxVad"
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:name="com.k2fsa.sherpa.onnx.vad.MainActivity"
|
||||
android:exported="true">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
package com.k2fsa.sherpa.onnx.vad
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
@@ -11,6 +11,9 @@ import android.view.View
|
||||
import android.widget.Button
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.R
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.getVadModelConfig
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
|
||||
@@ -116,7 +119,7 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
private fun initVadModel() {
|
||||
val type = 0
|
||||
println("Select VAD model type ${type}")
|
||||
Log.i(TAG, "Select VAD model type ${type}")
|
||||
val config = getVadModelConfig(type)
|
||||
|
||||
vad = Vad(
|
||||
@@ -171,4 +174,4 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
data class SileroVadModelConfig(
|
||||
var model: String,
|
||||
var threshold: Float = 0.5F,
|
||||
var minSilenceDuration: Float = 0.25F,
|
||||
var minSpeechDuration: Float = 0.25F,
|
||||
var windowSize: Int = 512,
|
||||
)
|
||||
|
||||
data class VadModelConfig(
|
||||
var sileroVadModelConfig: SileroVadModelConfig,
|
||||
var sampleRate: Int = 16000,
|
||||
var numThreads: Int = 1,
|
||||
var provider: String = "cpu",
|
||||
var debug: Boolean = false,
|
||||
)
|
||||
|
||||
class Vad(
|
||||
assetManager: AssetManager? = null,
|
||||
var config: VadModelConfig,
|
||||
) {
|
||||
private val ptr: Long
|
||||
|
||||
init {
|
||||
if (assetManager != null) {
|
||||
ptr = new(assetManager, config)
|
||||
} else {
|
||||
ptr = newFromFile(config)
|
||||
}
|
||||
}
|
||||
|
||||
protected fun finalize() {
|
||||
delete(ptr)
|
||||
}
|
||||
|
||||
fun acceptWaveform(samples: FloatArray) = acceptWaveform(ptr, samples)
|
||||
|
||||
fun empty(): Boolean = empty(ptr)
|
||||
fun pop() = pop(ptr)
|
||||
|
||||
// return an array containing
|
||||
// [start: Int, samples: FloatArray]
|
||||
fun front() = front(ptr)
|
||||
|
||||
fun clear() = clear(ptr)
|
||||
|
||||
fun isSpeechDetected(): Boolean = isSpeechDetected(ptr)
|
||||
|
||||
fun reset() = reset(ptr)
|
||||
|
||||
private external fun delete(ptr: Long)
|
||||
|
||||
private external fun new(
|
||||
assetManager: AssetManager,
|
||||
config: VadModelConfig,
|
||||
): Long
|
||||
|
||||
private external fun newFromFile(
|
||||
config: VadModelConfig,
|
||||
): Long
|
||||
|
||||
private external fun acceptWaveform(ptr: Long, samples: FloatArray)
|
||||
private external fun empty(ptr: Long): Boolean
|
||||
private external fun pop(ptr: Long)
|
||||
private external fun clear(ptr: Long)
|
||||
private external fun front(ptr: Long): Array<Any>
|
||||
private external fun isSpeechDetected(ptr: Long): Boolean
|
||||
private external fun reset(ptr: Long)
|
||||
|
||||
companion object {
|
||||
init {
|
||||
System.loadLibrary("sherpa-onnx-jni")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Please visit
|
||||
// https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
|
||||
// to download silero_vad.onnx
|
||||
// and put it inside the assets/
|
||||
// directory
|
||||
fun getVadModelConfig(type: Int): VadModelConfig? {
|
||||
when (type) {
|
||||
0 -> {
|
||||
return VadModelConfig(
|
||||
sileroVadModelConfig = SileroVadModelConfig(
|
||||
model = "silero_vad.onnx",
|
||||
threshold = 0.5F,
|
||||
minSilenceDuration = 0.25F,
|
||||
minSpeechDuration = 0.25F,
|
||||
windowSize = 512,
|
||||
),
|
||||
sampleRate = 16000,
|
||||
numThreads = 1,
|
||||
provider = "cpu",
|
||||
)
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/Vad.kt
|
||||
@@ -4,7 +4,7 @@
|
||||
xmlns:tools="http://schemas.android.com/tools"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
tools:context=".MainActivity">
|
||||
tools:context="com.k2fsa.sherpa.onnx.vad.MainActivity">
|
||||
<LinearLayout
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
@@ -40,4 +40,4 @@
|
||||
|
||||
|
||||
|
||||
</androidx.constraintlayout.widget.ConstraintLayout>
|
||||
</androidx.constraintlayout.widget.ConstraintLayout>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
android:theme="@style/Theme.SherpaOnnxVadAsr"
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:name=".vad.asr.MainActivity"
|
||||
android:exported="true">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/FeatureConfig.kt
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
package com.k2fsa.sherpa.onnx.vad.asr
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
@@ -13,6 +13,13 @@ import android.widget.Button
|
||||
import android.widget.TextView
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.k2fsa.sherpa.onnx.OfflineRecognizer
|
||||
import com.k2fsa.sherpa.onnx.OfflineRecognizerConfig
|
||||
import com.k2fsa.sherpa.onnx.R
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.getFeatureConfig
|
||||
import com.k2fsa.sherpa.onnx.getOfflineModelConfig
|
||||
import com.k2fsa.sherpa.onnx.getVadModelConfig
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
|
||||
@@ -40,7 +47,7 @@ class MainActivity : AppCompatActivity() {
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
// Non-streaming ASR
|
||||
private lateinit var offlineRecognizer: SherpaOnnxOffline
|
||||
private lateinit var offlineRecognizer: OfflineRecognizer
|
||||
|
||||
private var idx: Int = 0
|
||||
private var lastText: String = ""
|
||||
@@ -122,7 +129,7 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
private fun initVadModel() {
|
||||
val type = 0
|
||||
println("Select VAD model type ${type}")
|
||||
Log.i(TAG, "Select VAD model type ${type}")
|
||||
val config = getVadModelConfig(type)
|
||||
|
||||
vad = Vad(
|
||||
@@ -194,20 +201,25 @@ class MainActivity : AppCompatActivity() {
|
||||
// See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
|
||||
// for a list of available models
|
||||
val secondType = 0
|
||||
println("Select model type ${secondType} for the second pass")
|
||||
Log.i(TAG, "Select model type ${secondType} for the second pass")
|
||||
|
||||
val config = OfflineRecognizerConfig(
|
||||
featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80),
|
||||
modelConfig = getOfflineModelConfig(type = secondType)!!,
|
||||
)
|
||||
|
||||
offlineRecognizer = SherpaOnnxOffline(
|
||||
offlineRecognizer = OfflineRecognizer(
|
||||
assetManager = application.assets,
|
||||
config = config,
|
||||
)
|
||||
}
|
||||
|
||||
private fun runSecondPass(samples: FloatArray): String {
|
||||
return offlineRecognizer.decode(samples, sampleRateInHz)
|
||||
val stream = offlineRecognizer.createStream()
|
||||
stream.acceptWaveform(samples, sampleRateInHz)
|
||||
offlineRecognizer.decode(stream)
|
||||
val result = offlineRecognizer.getResult(stream)
|
||||
stream.release()
|
||||
return result.text
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineRecognizer.kt
|
||||
@@ -0,0 +1 @@
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineStream.kt
|
||||
@@ -1 +0,0 @@
|
||||
../../../../../../../../../SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/SherpaOnnx.kt
|
||||
@@ -1 +1 @@
|
||||
../../../../../../../../../SherpaOnnxVad/app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt
|
||||
../../../../../../../../../../sherpa-onnx/kotlin-api/Vad.kt
|
||||
@@ -4,7 +4,7 @@
|
||||
xmlns:tools="http://schemas.android.com/tools"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
tools:context=".MainActivity">
|
||||
tools:context=".vad.asr.MainActivity">
|
||||
|
||||
<LinearLayout
|
||||
android:layout_width="match_parent"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
<resources>
|
||||
<string name="app_name">VAD-ASR</string>
|
||||
<string name="app_name">VAD+ASR</string>
|
||||
<string name="hint">Click the Start button to play speech-to-text with Next-gen Kaldi.
|
||||
\n
|
||||
\n\n\n
|
||||
|
||||
Reference in New Issue
Block a user