Add emotion, event of SenseVoice. (#1257)
* Add emotion, event of SenseVoice.
* Fix tokens size check and update java api.

https://github.com/k2-fsa/sherpa-onnx/pull/1257
This commit is contained in:
@@ -531,6 +531,20 @@ const SherpaOnnxOfflineRecognizerResult *SherpaOnnxGetOfflineStreamResult(
|
|||||||
c_lang[lang.size()] = '\0';
|
c_lang[lang.size()] = '\0';
|
||||||
r->lang = c_lang;
|
r->lang = c_lang;
|
||||||
|
|
||||||
|
// emotion
|
||||||
|
const auto &emotion = result.emotion;
|
||||||
|
char *c_emotion = new char[emotion.size() + 1];
|
||||||
|
std::copy(emotion.begin(), emotion.end(), c_emotion);
|
||||||
|
c_emotion[emotion.size()] = '\0';
|
||||||
|
r->emotion = c_emotion;
|
||||||
|
|
||||||
|
// event
|
||||||
|
const auto &event = result.event;
|
||||||
|
char *c_event = new char[event.size() + 1];
|
||||||
|
std::copy(event.begin(), event.end(), c_event);
|
||||||
|
c_event[event.size()] = '\0';
|
||||||
|
r->event = c_event;
|
||||||
|
|
||||||
// copy json
|
// copy json
|
||||||
std::string json = result.AsJsonString();
|
std::string json = result.AsJsonString();
|
||||||
char *pJson = new char[json.size() + 1];
|
char *pJson = new char[json.size() + 1];
|
||||||
@@ -588,6 +602,8 @@ void SherpaOnnxDestroyOfflineRecognizerResult(
|
|||||||
delete[] r->tokens_arr;
|
delete[] r->tokens_arr;
|
||||||
delete[] r->json;
|
delete[] r->json;
|
||||||
delete[] r->lang;
|
delete[] r->lang;
|
||||||
|
delete[] r->emotion;
|
||||||
|
delete[] r->event;
|
||||||
delete r;
|
delete r;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -544,6 +544,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerResult {
|
|||||||
|
|
||||||
// return recognized language
|
// return recognized language
|
||||||
const char *lang;
|
const char *lang;
|
||||||
|
|
||||||
|
// return emotion.
|
||||||
|
const char *emotion;
|
||||||
|
|
||||||
|
// return event.
|
||||||
|
const char *event;
|
||||||
} SherpaOnnxOfflineRecognizerResult;
|
} SherpaOnnxOfflineRecognizerResult;
|
||||||
|
|
||||||
/// Get the result of the offline stream.
|
/// Get the result of the offline stream.
|
||||||
|
|||||||
@@ -52,6 +52,13 @@ static OfflineRecognitionResult ConvertSenseVoiceResult(
|
|||||||
|
|
||||||
r.words = std::move(src.words);
|
r.words = std::move(src.words);
|
||||||
|
|
||||||
|
// parse lang, emotion and event from tokens.
|
||||||
|
if (src.tokens.size() >= 3) {
|
||||||
|
r.lang = sym_table[src.tokens[0]];
|
||||||
|
r.emotion = sym_table[src.tokens[1]];
|
||||||
|
r.event = sym_table[src.tokens[2]];
|
||||||
|
}
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -304,6 +304,19 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const {
|
|||||||
std::string OfflineRecognitionResult::AsJsonString() const {
|
std::string OfflineRecognitionResult::AsJsonString() const {
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
os << "{";
|
os << "{";
|
||||||
|
|
||||||
|
os << "\"lang\""
|
||||||
|
<< ": ";
|
||||||
|
os << std::quoted(lang) << ", ";
|
||||||
|
|
||||||
|
os << "\"emotion\""
|
||||||
|
<< ": ";
|
||||||
|
os << std::quoted(emotion) << ", ";
|
||||||
|
|
||||||
|
os << "\"event\""
|
||||||
|
<< ": ";
|
||||||
|
os << std::quoted(event) << ", ";
|
||||||
|
|
||||||
os << "\"text\""
|
os << "\"text\""
|
||||||
<< ": ";
|
<< ": ";
|
||||||
os << std::quoted(text) << ", ";
|
os << std::quoted(text) << ", ";
|
||||||
|
|||||||
@@ -28,6 +28,12 @@ struct OfflineRecognitionResult {
|
|||||||
|
|
||||||
std::string lang;
|
std::string lang;
|
||||||
|
|
||||||
|
// emotion target of the audio.
|
||||||
|
std::string emotion;
|
||||||
|
|
||||||
|
// event target of the audio.
|
||||||
|
std::string event;
|
||||||
|
|
||||||
/// timestamps.size() == tokens.size()
|
/// timestamps.size() == tokens.size()
|
||||||
/// timestamps[i] records the time in seconds when tokens[i] is decoded.
|
/// timestamps[i] records the time in seconds when tokens[i] is decoded.
|
||||||
std::vector<float> timestamps;
|
std::vector<float> timestamps;
|
||||||
|
|||||||
@@ -41,7 +41,10 @@ public class OfflineRecognizer {
|
|||||||
String text = (String) arr[0];
|
String text = (String) arr[0];
|
||||||
String[] tokens = (String[]) arr[1];
|
String[] tokens = (String[]) arr[1];
|
||||||
float[] timestamps = (float[]) arr[2];
|
float[] timestamps = (float[]) arr[2];
|
||||||
return new OfflineRecognizerResult(text, tokens, timestamps);
|
String lang = (String) arr[3];
|
||||||
|
String emotion = (String) arr[4];
|
||||||
|
String event = (String) arr[5];
|
||||||
|
return new OfflineRecognizerResult(text, tokens, timestamps, lang, emotion, event);
|
||||||
}
|
}
|
||||||
|
|
||||||
private native void delete(long ptr);
|
private native void delete(long ptr);
|
||||||
|
|||||||
@@ -6,11 +6,17 @@ public class OfflineRecognizerResult {
|
|||||||
private final String text;
|
private final String text;
|
||||||
private final String[] tokens;
|
private final String[] tokens;
|
||||||
private final float[] timestamps;
|
private final float[] timestamps;
|
||||||
|
private final String lang;
|
||||||
|
private final String emotion;
|
||||||
|
private final String event;
|
||||||
|
|
||||||
public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps) {
|
public OfflineRecognizerResult(String text, String[] tokens, float[] timestamps, String lang, String emotion, String event) {
|
||||||
this.text = text;
|
this.text = text;
|
||||||
this.tokens = tokens;
|
this.tokens = tokens;
|
||||||
this.timestamps = timestamps;
|
this.timestamps = timestamps;
|
||||||
|
this.lang = lang;
|
||||||
|
this.emotion = emotion;
|
||||||
|
this.event = event;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
@@ -24,4 +30,16 @@ public class OfflineRecognizerResult {
|
|||||||
public float[] getTimestamps() {
|
public float[] getTimestamps() {
|
||||||
return timestamps;
|
return timestamps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getLang() {
|
||||||
|
return lang;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getEmotion() {
|
||||||
|
return emotion;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getEvent() {
|
||||||
|
return event;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -320,8 +320,11 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
|
|||||||
// [0]: text, jstring
|
// [0]: text, jstring
|
||||||
// [1]: tokens, array of jstring
|
// [1]: tokens, array of jstring
|
||||||
// [2]: timestamps, array of float
|
// [2]: timestamps, array of float
|
||||||
|
// [3]: lang, jstring
|
||||||
|
// [4]: emotion, jstring
|
||||||
|
// [5]: event, jstring
|
||||||
jobjectArray obj_arr = (jobjectArray)env->NewObjectArray(
|
jobjectArray obj_arr = (jobjectArray)env->NewObjectArray(
|
||||||
3, env->FindClass("java/lang/Object"), nullptr);
|
6, env->FindClass("java/lang/Object"), nullptr);
|
||||||
|
|
||||||
jstring text = env->NewStringUTF(result.text.c_str());
|
jstring text = env->NewStringUTF(result.text.c_str());
|
||||||
env->SetObjectArrayElement(obj_arr, 0, text);
|
env->SetObjectArrayElement(obj_arr, 0, text);
|
||||||
@@ -344,5 +347,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
|
|||||||
|
|
||||||
env->SetObjectArrayElement(obj_arr, 2, timestamps_arr);
|
env->SetObjectArrayElement(obj_arr, 2, timestamps_arr);
|
||||||
|
|
||||||
|
// [3]: lang, jstring
|
||||||
|
// [4]: emotion, jstring
|
||||||
|
// [5]: event, jstring
|
||||||
|
env->SetObjectArrayElement(obj_arr, 3, env->NewStringUTF(result.lang.c_str()));
|
||||||
|
env->SetObjectArrayElement(obj_arr, 4, env->NewStringUTF(result.emotion.c_str()));
|
||||||
|
env->SetObjectArrayElement(obj_arr, 5, env->NewStringUTF(result.event.c_str()));
|
||||||
|
|
||||||
return obj_arr;
|
return obj_arr;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,9 @@ data class OfflineRecognizerResult(
|
|||||||
val text: String,
|
val text: String,
|
||||||
val tokens: Array<String>,
|
val tokens: Array<String>,
|
||||||
val timestamps: FloatArray,
|
val timestamps: FloatArray,
|
||||||
|
val lang: String,
|
||||||
|
val emotion: String,
|
||||||
|
val event: String,
|
||||||
)
|
)
|
||||||
|
|
||||||
data class OfflineTransducerModelConfig(
|
data class OfflineTransducerModelConfig(
|
||||||
@@ -96,7 +99,10 @@ class OfflineRecognizer(
|
|||||||
val text = objArray[0] as String
|
val text = objArray[0] as String
|
||||||
val tokens = objArray[1] as Array<String>
|
val tokens = objArray[1] as Array<String>
|
||||||
val timestamps = objArray[2] as FloatArray
|
val timestamps = objArray[2] as FloatArray
|
||||||
return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps)
|
val lang = objArray[3] as String
|
||||||
|
val emotion = objArray[4] as String
|
||||||
|
val event = objArray[5] as String
|
||||||
|
return OfflineRecognizerResult(text = text, tokens = tokens, timestamps = timestamps, lang = lang, emotion = emotion, event = event)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun decode(stream: OfflineStream) = decode(ptr, stream.ptr)
|
fun decode(stream: OfflineStream) = decode(ptr, stream.ptr)
|
||||||
|
|||||||
@@ -32,6 +32,12 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
|
|||||||
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
|
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
|
||||||
self.text.size(), "ignore"));
|
self.text.size(), "ignore"));
|
||||||
})
|
})
|
||||||
|
.def_property_readonly("lang",
|
||||||
|
[](const PyClass &self) { return self.lang; })
|
||||||
|
.def_property_readonly("emotion",
|
||||||
|
[](const PyClass &self) { return self.emotion; })
|
||||||
|
.def_property_readonly("event",
|
||||||
|
[](const PyClass &self) { return self.event; })
|
||||||
.def_property_readonly("tokens",
|
.def_property_readonly("tokens",
|
||||||
[](const PyClass &self) { return self.tokens; })
|
[](const PyClass &self) { return self.tokens; })
|
||||||
.def_property_readonly("words",
|
.def_property_readonly("words",
|
||||||
|
|||||||
Reference in New Issue
Block a user