Add Android demo for speaker recognition (#536)

See pre-built Android APKs at 
https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html
This commit is contained in:
Fangjun Kuang
2024-01-23 16:50:52 +08:00
committed by GitHub
parent 626775e5e2
commit bbd7c7fc18
73 changed files with 3022 additions and 6 deletions

2
sherpa-onnx/csrc/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*.cc-bak
*.h-bak

View File

@@ -22,6 +22,12 @@ class SpeakerEmbeddingExtractorGeneralImpl
const SpeakerEmbeddingExtractorConfig &config)
: model_(config) {}
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9.
// Forwards `mgr` and `config` unchanged to the (mgr, config) constructor of
// model_.
SpeakerEmbeddingExtractorGeneralImpl(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
: model_(mgr, config) {}
#endif
int32_t Dim() const override { return model_.GetMetaData().output_dim; }
std::unique_ptr<OnlineStream> CreateStream() const override {

View File

@@ -90,4 +90,35 @@ SpeakerEmbeddingExtractorImpl::Create(
return nullptr;
}
#if __ANDROID_API__ >= 9
// Factory used when the model is read through an AAssetManager.
//
// @param mgr     Asset manager passed to ReadFile() and to the selected
//                implementation.
// @param config  Extractor configuration; config.model names the model to
//                read and config.debug is forwarded to GetModelType().
// @return The implementation matching the detected model type, or nullptr
//         if the model type is not recognized.
std::unique_ptr<SpeakerEmbeddingExtractorImpl>
SpeakerEmbeddingExtractorImpl::Create(
    AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config) {
  // The model bytes are needed only for type detection, so they live inside
  // the lambda and are released before any implementation is constructed.
  const ModelType detected = [&]() -> ModelType {
    auto model_bytes = ReadFile(mgr, config.model);
    return GetModelType(model_bytes.data(), model_bytes.size(), config.debug);
  }();

  if (detected == ModelType::kUnkown) {
    SHERPA_ONNX_LOGE(
        "Unknown model type in for speaker embedding extractor!");
    return nullptr;
  }

  switch (detected) {
    case ModelType::kNeMo:
      return std::make_unique<SpeakerEmbeddingExtractorNeMoImpl>(mgr, config);
    case ModelType::kWeSpeaker:
    case ModelType::k3dSpeaker:
      // kWeSpeaker and k3dSpeaker are both served by the general impl
      return std::make_unique<SpeakerEmbeddingExtractorGeneralImpl>(mgr,
                                                                    config);
    case ModelType::kUnkown:
      // already handled by the guard above
      break;
  }

  // Reached only when no case above returned
  return nullptr;
}
#endif
} // namespace sherpa_onnx

View File

@@ -9,6 +9,11 @@
#include <string>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
namespace sherpa_onnx {
@@ -20,6 +25,11 @@ class SpeakerEmbeddingExtractorImpl {
static std::unique_ptr<SpeakerEmbeddingExtractorImpl> Create(
const SpeakerEmbeddingExtractorConfig &config);
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9. The model named by
// config.model is read through `mgr`. Returns nullptr if the model type
// is not recognized.
static std::unique_ptr<SpeakerEmbeddingExtractorImpl> Create(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config);
#endif
virtual int32_t Dim() const = 0;
virtual std::unique_ptr<OnlineStream> CreateStream() const = 0;

View File

@@ -28,6 +28,19 @@ class SpeakerEmbeddingExtractorModel::Impl {
}
}
#if __ANDROID_API__ >= 9
// Constructs the model from bytes obtained via ReadFile(mgr, config.model)
// rather than from a path on disk.
//
// @param mgr     Asset manager handed to ReadFile().
// @param config  Extractor configuration; also used to build the session
//                options.
Impl(AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
    : config_(config),
      env_(ORT_LOGGING_LEVEL_ERROR),
      sess_opts_(GetSessionOptions(config)),
      allocator_{} {
  // Load the whole model into memory, then initialize from that buffer.
  auto model_bytes = ReadFile(mgr, config.model);
  Init(model_bytes.data(), model_bytes.size());
}
#endif
Ort::Value Compute(Ort::Value x) const {
std::array<Ort::Value, 1> inputs = {std::move(x)};
@@ -98,6 +111,12 @@ SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel(
const SpeakerEmbeddingExtractorConfig &config)
: impl_(std::make_unique<Impl>(config)) {}
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9. Creates the private
// Impl with the (mgr, config) constructor, which reads config.model through
// `mgr`.
SpeakerEmbeddingExtractorModel::SpeakerEmbeddingExtractorModel(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
: impl_(std::make_unique<Impl>(mgr, config)) {}
#endif
SpeakerEmbeddingExtractorModel::~SpeakerEmbeddingExtractorModel() = default;
const SpeakerEmbeddingExtractorModelMetaData &

View File

@@ -6,6 +6,11 @@
#include <memory>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/speaker-embedding-extractor-model-meta-data.h"
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
@@ -17,6 +22,11 @@ class SpeakerEmbeddingExtractorModel {
explicit SpeakerEmbeddingExtractorModel(
const SpeakerEmbeddingExtractorConfig &config);
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9: the model named by
// config.model is read through `mgr`.
SpeakerEmbeddingExtractorModel(AAssetManager *mgr,
const SpeakerEmbeddingExtractorConfig &config);
#endif
~SpeakerEmbeddingExtractorModel();
const SpeakerEmbeddingExtractorModelMetaData &GetMetaData() const;

View File

@@ -22,6 +22,12 @@ class SpeakerEmbeddingExtractorNeMoImpl : public SpeakerEmbeddingExtractorImpl {
const SpeakerEmbeddingExtractorConfig &config)
: model_(config) {}
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9.
// Forwards `mgr` and `config` unchanged to the (mgr, config) constructor of
// model_.
SpeakerEmbeddingExtractorNeMoImpl(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
: model_(mgr, config) {}
#endif
int32_t Dim() const override { return model_.GetMetaData().output_dim; }
std::unique_ptr<OnlineStream> CreateStream() const override {

View File

@@ -28,6 +28,19 @@ class SpeakerEmbeddingExtractorNeMoModel::Impl {
}
}
#if __ANDROID_API__ >= 9
// Constructs the NeMo model from bytes obtained via
// ReadFile(mgr, config.model) rather than from a path on disk.
//
// @param mgr     Asset manager handed to ReadFile().
// @param config  Extractor configuration; also used to build the session
//                options.
Impl(AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
    : config_(config),
      env_(ORT_LOGGING_LEVEL_ERROR),
      sess_opts_(GetSessionOptions(config)),
      allocator_{} {
  // Load the whole model into memory, then initialize from that buffer.
  auto model_bytes = ReadFile(mgr, config.model);
  Init(model_bytes.data(), model_bytes.size());
}
#endif
Ort::Value Compute(Ort::Value x, Ort::Value x_lens) const {
std::array<Ort::Value, 2> inputs = {std::move(x), std::move(x_lens)};
@@ -106,6 +119,12 @@ SpeakerEmbeddingExtractorNeMoModel::SpeakerEmbeddingExtractorNeMoModel(
const SpeakerEmbeddingExtractorConfig &config)
: impl_(std::make_unique<Impl>(config)) {}
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9. Creates the private
// Impl with the (mgr, config) constructor, which reads config.model through
// `mgr`.
SpeakerEmbeddingExtractorNeMoModel::SpeakerEmbeddingExtractorNeMoModel(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
: impl_(std::make_unique<Impl>(mgr, config)) {}
#endif
SpeakerEmbeddingExtractorNeMoModel::~SpeakerEmbeddingExtractorNeMoModel() =
default;

View File

@@ -6,6 +6,11 @@
#include <memory>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/speaker-embedding-extractor-nemo-model-meta-data.h"
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
@@ -17,6 +22,11 @@ class SpeakerEmbeddingExtractorNeMoModel {
explicit SpeakerEmbeddingExtractorNeMoModel(
const SpeakerEmbeddingExtractorConfig &config);
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9: the model named by
// config.model is read through `mgr`.
SpeakerEmbeddingExtractorNeMoModel(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config);
#endif
~SpeakerEmbeddingExtractorNeMoModel();
const SpeakerEmbeddingExtractorNeMoModelMetaData &GetMetaData() const;

View File

@@ -55,6 +55,12 @@ SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor(
const SpeakerEmbeddingExtractorConfig &config)
: impl_(SpeakerEmbeddingExtractorImpl::Create(config)) {}
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9. Delegates to
// SpeakerEmbeddingExtractorImpl::Create(mgr, config).
//
// NOTE(review): Create() returns nullptr for an unrecognized model type, so
// impl_ can be null after construction, and Dim() dereferences impl_ without
// a check. Callers presumably must pass a supported model -- confirm.
SpeakerEmbeddingExtractor::SpeakerEmbeddingExtractor(
AAssetManager *mgr, const SpeakerEmbeddingExtractorConfig &config)
: impl_(SpeakerEmbeddingExtractorImpl::Create(mgr, config)) {}
#endif
SpeakerEmbeddingExtractor::~SpeakerEmbeddingExtractor() = default;
int32_t SpeakerEmbeddingExtractor::Dim() const { return impl_->Dim(); }

View File

@@ -9,6 +9,11 @@
#include <string>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/online-stream.h"
#include "sherpa-onnx/csrc/parse-options.h"
@@ -40,6 +45,11 @@ class SpeakerEmbeddingExtractor {
explicit SpeakerEmbeddingExtractor(
const SpeakerEmbeddingExtractorConfig &config);
#if __ANDROID_API__ >= 9
// Overload compiled only when __ANDROID_API__ >= 9: the model named by
// config.model is read through `mgr`.
SpeakerEmbeddingExtractor(AAssetManager *mgr,
const SpeakerEmbeddingExtractorConfig &config);
#endif
~SpeakerEmbeddingExtractor();
// Return the dimension of the embedding

View File

@@ -8,6 +8,7 @@
#include <unordered_map>
#include "Eigen/Dense"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
@@ -36,6 +37,52 @@ class SpeakerEmbeddingManager::Impl {
return true;
}
bool Add(const std::string &name,
const std::vector<std::vector<float>> &embedding_list) {
if (name2row_.count(name)) {
// a speaker with the same name already exists
return false;
}
if (embedding_list.empty()) {
SHERPA_ONNX_LOGE("Empty list of embeddings");
return false;
}
for (const auto &x : embedding_list) {
if (x.size() != dim_) {
SHERPA_ONNX_LOGE("Given dim: %d, expected dim: %d",
static_cast<int32_t>(x.size()), dim_);
return false;
}
}
// compute the average
Eigen::RowVectorXf v = Eigen::Map<Eigen::RowVectorXf>(
const_cast<float *>(embedding_list[0].data()), dim_);
int32_t i = -1;
for (const auto &x : embedding_list) {
++i;
if (i == 0) {
continue;
}
v += Eigen::Map<Eigen::RowVectorXf>(const_cast<float *>(x.data()), dim_);
}
// no need to compute the mean since we are going to normalize it anyway
// v /= embedding_list.size();
v.normalize();
embedding_matrix_.conservativeResize(embedding_matrix_.rows() + 1, dim_);
embedding_matrix_.bottomRows(1) = v;
name2row_[name] = embedding_matrix_.rows() - 1;
row2name_[embedding_matrix_.rows() - 1] = name;
return true;
}
bool Remove(const std::string &name) {
if (!name2row_.count(name)) {
return false;
@@ -104,8 +151,24 @@ class SpeakerEmbeddingManager::Impl {
return true;
}
// Returns true if `name` is a key of name2row_, i.e. a speaker with that
// name has been added; false otherwise.
bool Contains(const std::string &name) const {
  return name2row_.find(name) != name2row_.end();
}
// Number of rows in embedding_matrix_ (Add() appends one row per speaker).
int32_t NumSpeakers() const {
  return static_cast<int32_t>(embedding_matrix_.rows());
}
// Returns dim_, the size every embedding passed to Add() must have.
int32_t Dim() const { return dim_; }
// Returns the names of all registered speakers, sorted in ascending order
// (std::string operator<).
std::vector<std::string> GetAllSpeakers() const {
  std::vector<std::string> all_speakers;
  // The final size is known up front, so allocate once.
  all_speakers.reserve(name2row_.size());
  for (const auto &p : name2row_) {
    all_speakers.push_back(p.first);
  }

  // Names are keys of name2row_ and therefore unique, so a stable sort
  // gives no benefit over std::sort.
  std::sort(all_speakers.begin(), all_speakers.end());
  return all_speakers;
}
private:
int32_t dim_;
FloatMatrix embedding_matrix_;
@@ -123,6 +186,12 @@ bool SpeakerEmbeddingManager::Add(const std::string &name,
return impl_->Add(name, p);
}
// Forwards to Impl::Add(name, embedding_list) and returns its result.
// Although declared const, this modifies the state held behind impl_
// (a new row and new name<->row entries are added on success).
bool SpeakerEmbeddingManager::Add(
const std::string &name,
const std::vector<std::vector<float>> &embedding_list) const {
return impl_->Add(name, embedding_list);
}
// Forwards to Impl::Remove(name) and returns its result; Impl::Remove
// returns false when no speaker named `name` is registered.
bool SpeakerEmbeddingManager::Remove(const std::string &name) const {
return impl_->Remove(name);
}
@@ -141,4 +210,14 @@ int32_t SpeakerEmbeddingManager::NumSpeakers() const {
return impl_->NumSpeakers();
}
// Forwards to Impl::Dim(), which returns the embedding dimension.
int32_t SpeakerEmbeddingManager::Dim() const { return impl_->Dim(); }
// Forwards to Impl::Contains(name): true if a speaker with that name has
// been added.
bool SpeakerEmbeddingManager::Contains(const std::string &name) const {
return impl_->Contains(name);
}
// Forwards to Impl::GetAllSpeakers(), which returns the speaker names in
// sorted order.
std::vector<std::string> SpeakerEmbeddingManager::GetAllSpeakers() const {
return impl_->GetAllSpeakers();
}
} // namespace sherpa_onnx

View File

@@ -7,6 +7,7 @@
#include <memory>
#include <string>
#include <vector>
namespace sherpa_onnx {
@@ -26,6 +27,19 @@ class SpeakerEmbeddingManager {
*/
bool Add(const std::string &name, const float *p) const;
/** Add a list of embeddings of a speaker.
*
* @param name Name of the speaker
* @param embedding_list A non-empty list of embeddings. Each entry should be
* of size `dim`. The normalized average of the list is
* the final embedding.
* @return Return true if added successfully. Return false if it failed.
* It fails if there is already a speaker with the same `name`,
* if `embedding_list` is empty, or if any entry's size differs
* from `dim`.
*/
bool Add(const std::string &name,
const std::vector<std::vector<float>> &embedding_list) const;
/* Remove a speaker by its name.
*
* @param name Name of the speaker to remove.
@@ -60,8 +74,16 @@ class SpeakerEmbeddingManager {
*/
bool Verify(const std::string &name, const float *p, float threshold) const;
// Return true if a speaker named `name` has already been added; return
// false otherwise.
bool Contains(const std::string &name) const;
// Return the number of rows in the embedding matrix (Add() appends one row
// per speaker).
int32_t NumSpeakers() const;
// Return the dimension that every embedding passed to Add() must have.
int32_t Dim() const;
// Return a list of speaker names, sorted in ascending order.
std::vector<std::string> GetAllSpeakers() const;
private:
class Impl;
std::unique_ptr<Impl> impl_;