Add C++ example for streaming ASR with SenseVoice. (#2199)

This commit is contained in:
Fangjun Kuang
2025-05-11 00:23:32 +08:00
committed by GitHub
parent fc2121c307
commit 028b8f2718
16 changed files with 514 additions and 60 deletions

View File

@@ -678,4 +678,42 @@ void VoiceActivityDetector::Flush() const {
SherpaOnnxVoiceActivityDetectorFlush(p_);
}
// Factory for LinearResampler.
//
// Passes all four arguments unchanged to SherpaOnnxCreateLinearResampler()
// and wraps the handle it returns. The handle is not validated here.
LinearResampler LinearResampler::Create(int32_t samp_rate_in_hz,
                                        int32_t samp_rate_out_hz,
                                        float filter_cutoff_hz,
                                        int32_t num_zeros) {
  return LinearResampler(SherpaOnnxCreateLinearResampler(
      samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros));
}
// Wraps an existing C handle; the pointer is handed straight to the MoveOnly
// base class, which stores it.
LinearResampler::LinearResampler(const SherpaOnnxLinearResampler *p)
: MoveOnly<LinearResampler, SherpaOnnxLinearResampler>(p) {}
// Releases the given C handle by calling SherpaOnnxDestroyLinearResampler().
// NOTE(review): presumably invoked by the MoveOnly base when the wrapper goes
// away -- confirm against the MoveOnly definition.
void LinearResampler::Destroy(const SherpaOnnxLinearResampler *p) const {
SherpaOnnxDestroyLinearResampler(p);
}
// Forwards to SherpaOnnxLinearResamplerReset() on the wrapped handle.
// NOTE(review): presumably discards buffered state so a new stream can be
// processed -- confirm against the C API documentation.
void LinearResampler::Reset() const { SherpaOnnxLinearResamplerReset(p_); }
// Runs the wrapped resampler on `input_dim` samples starting at `input`.
//
// The arguments are forwarded to SherpaOnnxLinearResamplerResample(). The
// samples it reports are copied into a std::vector, and the C-side result is
// then released with SherpaOnnxLinearResamplerResampleFree(), so the caller
// receives an independent copy.
//
// NOTE(review): `flush` is presumably set on the final chunk of a stream so
// that remaining buffered samples are emitted -- confirm against the C API.
std::vector<float> LinearResampler::Resample(const float *input,
                                             int32_t input_dim,
                                             bool flush) const {
  auto *result =
      SherpaOnnxLinearResamplerResample(p_, input, input_dim, flush);

  // Copy [samples, samples + n) before the C buffer is freed.
  const float *first = result->samples;
  const float *last = first + result->n;
  std::vector<float> resampled(first, last);

  SherpaOnnxLinearResamplerResampleFree(result);

  return resampled;
}
// Returns whatever SherpaOnnxLinearResamplerResampleGetInputSampleRate()
// reports for the wrapped handle.
// NOTE(review): presumably the input rate in Hz given to Create() -- confirm.
int32_t LinearResampler::GetInputSamplingRate() const {
return SherpaOnnxLinearResamplerResampleGetInputSampleRate(p_);
}
// Returns whatever SherpaOnnxLinearResamplerResampleGetOutputSampleRate()
// reports for the wrapped handle.
// NOTE(review): presumably the output rate in Hz given to Create() -- confirm.
int32_t LinearResampler::GetOutputSamplingRate() const {
return SherpaOnnxLinearResamplerResampleGetOutputSampleRate(p_);
}
} // namespace sherpa_onnx::cxx

View File

@@ -111,6 +111,7 @@ SHERPA_ONNX_API bool WriteWave(const std::string &filename, const Wave &wave);
template <typename Derived, typename T>
class SHERPA_ONNX_API MoveOnly {
public:
MoveOnly() = default;
explicit MoveOnly(const T *p) : p_(p) {}
~MoveOnly() { Destroy(); }
@@ -591,6 +592,28 @@ class SHERPA_ONNX_API VoiceActivityDetector
explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
};
// C++ wrapper around a SherpaOnnxLinearResampler C handle.
//
// The handle is held by the MoveOnly base class. Instances holding a handle
// are obtained through Create(); the handle-taking constructor is private.
class SHERPA_ONNX_API LinearResampler
: public MoveOnly<LinearResampler, SherpaOnnxLinearResampler> {
public:
// Default-constructs a wrapper without going through Create().
// NOTE(review): presumably holds no handle in this state, so the other
// member functions should not be called on it -- confirm against MoveOnly.
LinearResampler() = default;
// Creates a resampler from the given settings.
// NOTE(review): parameter meanings are inferred from their names (input and
// output sample rates in Hz, low-pass cutoff in Hz, filter width in zero
// crossings) -- confirm against the C API documentation.
static LinearResampler Create(int32_t samp_rate_in_hz,
int32_t samp_rate_out_hz,
float filter_cutoff_hz, int32_t num_zeros);
// Releases the C handle `p`.
void Destroy(const SherpaOnnxLinearResampler *p) const;
// Resets the underlying resampler.
void Reset() const;
// Resamples `input_dim` samples starting at `input` and returns the output
// samples as a new vector.
// NOTE(review): `flush` presumably marks the last chunk of a stream -- confirm.
std::vector<float> Resample(const float *input, int32_t input_dim,
bool flush) const;
// Sample-rate accessors for the underlying resampler.
int32_t GetInputSamplingRate() const;
int32_t GetOutputSamplingRate() const;
private:
// Wraps an already-created C handle; used by Create().
explicit LinearResampler(const SherpaOnnxLinearResampler *p);
};
} // namespace sherpa_onnx::cxx
#endif // SHERPA_ONNX_C_API_CXX_API_H_

View File

@@ -166,20 +166,32 @@ class HomophoneReplacer::Impl {
}
// convert words to pronunciations
std::vector<std::string> pronunciations;
std::vector<std::string> current_words;
std::vector<std::string> current_pronunciations;
for (const auto &w : words) {
if (w.size() < 3 ||
reinterpret_cast<const uint8_t *>(w.data())[0] < 128) {
if (!current_words.empty()) {
ans += ApplyImpl(current_words, current_pronunciations);
current_words.clear();
current_pronunciations.clear();
}
ans += w;
continue;
}
auto p = ConvertWordToPronunciation(w);
if (config_.debug) {
SHERPA_ONNX_LOGE("%s %s", w.c_str(), p.c_str());
}
pronunciations.push_back(std::move(p));
current_words.push_back(w);
current_pronunciations.push_back(std::move(p));
}
for (const auto &r : replacer_list_) {
ans = r->Normalize(words, pronunciations);
// TODO(fangjun): We support only 1 rule fst at present.
break;
if (!current_words.empty()) {
ans += ApplyImpl(current_words, current_pronunciations);
}
if (config_.debug) {
@@ -190,6 +202,16 @@ class HomophoneReplacer::Impl {
}
private:
// Runs the homophone replacement rules over one run of words.
//
// `words` and `pronunciations` are passed together to Normalize() of the
// first entry in replacer_list_, and its result is returned. An empty
// string is returned when replacer_list_ has no entries.
std::string ApplyImpl(const std::vector<std::string> &words,
                      const std::vector<std::string> &pronunciations) const {
  for (const auto &replacer : replacer_list_) {
    // TODO(fangjun): We support only 1 rule fst at present.
    // Only the first replacer is consulted, so return from the first
    // iteration.
    return replacer->Normalize(words, pronunciations);
  }

  return std::string();
}
std::string ConvertWordToPronunciation(const std::string &word) const {
if (word2pron_.count(word)) {
return word2pron_.at(word);
@@ -239,6 +261,9 @@ class HomophoneReplacer::Impl {
}
while (iss >> p) {
if (p.back() > '4') {
p.push_back('1');
}
pron.append(std::move(p));
}