Add emotion and event outputs of SenseVoice. (#1257)
* Add emotion and event outputs of SenseVoice. * Fix the tokens size check and update the Java API. https://github.com/k2-fsa/sherpa-onnx/pull/1257
This commit is contained in:
@@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult(
|
||||
c_lang[lang.size()] = '\0';
|
||||
r->lang = c_lang;
|
||||
|
||||
// emotion
|
||||
const auto &emotion = result.emotion;
|
||||
char *c_emotion = new char[emotion.size() + 1];
|
||||
std::copy(emotion.begin(), emotion.end(), c_emotion);
|
||||
c_emotion[emotion.size()] = '\0';
|
||||
r->emotion = c_emotion;
|
||||
|
||||
// event
|
||||
const auto &event = result.event;
|
||||
char *c_event = new char[event.size() + 1];
|
||||
std::copy(event.begin(), event.end(), c_event);
|
||||
c_event[event.size()] = '\0';
|
||||
r->event = c_event;
|
||||
|
||||
// copy json
|
||||
std::string json = result.AsJsonString();
|
||||
char *pJson = new char[json.size() + 1];
|
||||
@@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult(
|
||||
delete[] r->tokens_arr;
|
||||
delete[] r->json;
|
||||
delete[] r->lang;
|
||||
delete[] r->emotion;
|
||||
delete[] r->event;
|
||||
delete r;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
|
||||
|
||||
// return recognized language
|
||||
const char *lang;
|
||||
|
||||
// return the emotion label of the audio as a NUL-terminated string.
|
||||
const char *emotion;
|
||||
|
||||
// return the event label of the audio as a NUL-terminated string.
|
||||
const char *event;
|
||||
} SherpaOnnxOfflineRecognizerResult;
|
||||
|
||||
/// Get the result of the offline stream.
|
||||
|
||||
@@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult(
|
||||
|
||||
r.words = std::move(src.words);
|
||||
|
||||
// parse lang, emotion and event from tokens.
|
||||
if (src.tokens.size() >= 3) {
|
||||
r.lang = sym_table[src.tokens[0]];
|
||||
r.emotion = sym_table[src.tokens[1]];
|
||||
r.event = sym_table[src.tokens[2]];
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
@@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const {
|
||||
std::string OfflineRecognitionResult::AsJsonString() const {
|
||||
std::ostringstream os;
|
||||
os << "{";
|
||||
|
||||
os << "\"lang\""
|
||||
<< ": ";
|
||||
os << std::quoted(lang) << ", ";
|
||||
|
||||
os << "\"emotion\""
|
||||
<< ": ";
|
||||
os << std::quoted(emotion) << ", ";
|
||||
|
||||
os << "\"event\""
|
||||
<< ": ";
|
||||
os << std::quoted(event) << ", ";
|
||||
|
||||
os << "\"text\""
|
||||
<< ": ";
|
||||
os << std::quoted(text) << ", ";
|
||||
|
||||
@@ -28,6 +28,12 @@ struct OfflineRecognitionResult {
|
||||
|
||||
std::string lang;
|
||||
|
||||
// emotion label of the audio; for SenseVoice results it is looked up from tokens[1].
|
||||
std::string emotion;
|
||||
|
||||
// event label of the audio; for SenseVoice results it is looked up from tokens[2].
|
||||
std::string event;
|
||||
|
||||
/// timestamps.size() == tokens.size()
|
||||
/// timestamps[i] records the time in seconds when tokens[i] is decoded.
|
||||
std::vector<float> timestamps;
|
||||
|
||||
@@ -41,7 +41,10 @@ public class OfflineRecognizer {
|
||||
String text = (String) arr[0];
|
||||
String[] tokens = (String[]) arr[1];
|
||||
float[] timestamps = (float[]) arr[2];
|
||||
return new OfflineRecognizerResult(text, tokens, timestamps);
|
||||
String lang = (String) arr[3];
|
||||
String emotion = (String) arr[4];
|
||||
String event = (String) arr[5];
|
||||
return new OfflineRecognizerResult(text, tokens, timestamps, lang, emotion, event);
|
||||
}
|
||||
|
||||
private native void delete(long ptr);
|
||||
|
||||
@@ -6,11 +6,17 @@ public class OfflineRecognizerResult {
|
||||
private final String text;
|
||||
private final String[] tokens;
|
||||
private final float[] timestamps;
|
||||
private final String lang;
|
||||
private final String emotion;
|
||||
private final String event;
|
||||
|
||||
public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps) {
|
||||
public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps, String lang, String emotion, String event) {
|
||||
this.text = text;
|
||||
this.tokens = tokens;
|
||||
this.timestamps = timestamps;
|
||||
this.lang = lang;
|
||||
this.emotion = emotion;
|
||||
this.event = event;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
@@ -24,4 +30,16 @@ public class OfflineRecognizerResult {
|
||||
/** Returns the timestamps array this result was constructed with (the stored reference, not a copy). */
public float[] getTimestamps() {
|
||||
return timestamps;
|
||||
}
|
||||
|
||||
/** Returns the language string this result was constructed with. */
public String getLang() {
|
||||
return lang;
|
||||
}
|
||||
|
||||
/** Returns the emotion string this result was constructed with. */
public String getEmotion() {
|
||||
return emotion;
|
||||
}
|
||||
|
||||
/** Returns the event string this result was constructed with. */
public String getEvent() {
|
||||
return event;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
|
||||
// [0]: text, jstring
|
||||
// [1]: tokens, array of jstring
|
||||
// [2]: timestamps, array of float
|
||||
// [3]: lang, jstring
|
||||
// [4]: emotion, jstring
|
||||
// [5]: event, jstring
|
||||
jobjectArray obj_arr = (jobjectArray)env->NewObjectArray(
|
||||
3, env->FindClass("java/lang/Object"), nullptr);
|
||||
6, env->FindClass("java/lang/Object"), nullptr);
|
||||
|
||||
jstring text = env->NewStringUTF(result.text.c_str());
|
||||
env->SetObjectArrayElement(obj_arr, 0, text);
|
||||
@@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
|
||||
|
||||
env->SetObjectArrayElement(obj_arr, 2, timestamps_arr);
|
||||
|
||||
// [3]: lang, jstring
|
||||
// [4]: emotion, jstring
|
||||
// [5]: event, jstring
|
||||
env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str()));
|
||||
env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str()));
|
||||
env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str()));
|
||||
|
||||
return obj_arr;
|
||||
}
|
||||
|
||||
@@ -6,6 +6,9 @@ data class OfflineRecognizerResult(
|
||||
val text: String,
|
||||
val tokens: Array<String>,
|
||||
val timestamps: FloatArray,
|
||||
val lang: String,
|
||||
val emotion: String,
|
||||
val event: String,
|
||||
)
|
||||
|
||||
data class OfflineTransducerModelConfig(
|
||||
@@ -96,7 +99,10 @@ class OfflineRecognizer(
|
||||
val text = objArray[0] as String
|
||||
val tokens = objArray[1] as Array<String>
|
||||
val timestamps = objArray[2] as FloatArray
|
||||
return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps)
|
||||
val lang = objArray[3] as String
|
||||
val emotion = objArray[4] as String
|
||||
val event = objArray[5] as String
|
||||
return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps, lang = lang, emotion = emotion, event = event)
|
||||
}
|
||||
|
||||
fun decode(stream: OfflineStream) = decode(ptr, stream.ptr)
|
||||
|
||||
@@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
|
||||
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
|
||||
self.text.size(), "ignore"));
|
||||
})
|
||||
.def_property_readonly("lang",
|
||||
[](const PyClass &self) { return self.lang; })
|
||||
.def_property_readonly("emotion",
|
||||
[](const PyClass &self) { return self.emotion; })
|
||||
.def_property_readonly("event",
|
||||
[](const PyClass &self) { return self.event; })
|
||||
.def_property_readonly("tokens",
|
||||
[](const PyClass &self) { return self.tokens; })
|
||||
.def_property_readonly("words",
|
||||
|
||||
Reference in New Issue
Block a user