Add microphone streaming ASR example for C API (#650)
This commit is contained in:
9
c-api-examples/asr-microphone-example/CMakeLists.txt
Normal file
9
c-api-examples/asr-microphone-example/CMakeLists.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
|
||||
# Streaming ASR microphone demo built on the sherpa-onnx C API.
# alsa.cc is a symlink to sherpa-onnx/csrc/alsa.cc so the example reuses
# the project's ALSA microphone reader; cargs provides CLI parsing.
add_executable(c-api-alsa c-api-alsa.cc alsa.cc)
target_link_libraries(c-api-alsa sherpa-onnx-c-api cargs)

# Cross-compilation hook: point SHERPA_ONNX_ALSA_LIB_DIR at a target
# sysroot's libasound instead of the build host's default one.
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
target_link_libraries(c-api-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
else()
target_link_libraries(c-api-alsa asound)
endif()
|
||||
1
c-api-examples/asr-microphone-example/CPPLINT.cfg
Normal file
1
c-api-examples/asr-microphone-example/CPPLINT.cfg
Normal file
@@ -0,0 +1 @@
|
||||
exclude_files=alsa.cc|alsa.h
|
||||
12
c-api-examples/asr-microphone-example/README.md
Normal file
12
c-api-examples/asr-microphone-example/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Introduction
|
||||
|
||||
This folder contains examples for real-time speech recognition from a microphone
using the sherpa-onnx C API.
|
||||
|
||||
**Note**: You can call C API from C++ files.
|
||||
|
||||
|
||||
## ./c-api-alsa.cc
|
||||
|
||||
This file uses ALSA to read audio from a microphone. It runs only on Linux;
macOS and Windows are not supported.
||||
1
c-api-examples/asr-microphone-example/alsa.cc
Symbolic link
1
c-api-examples/asr-microphone-example/alsa.cc
Symbolic link
@@ -0,0 +1 @@
|
||||
../../sherpa-onnx/csrc/alsa.cc
|
||||
1
c-api-examples/asr-microphone-example/alsa.h
Symbolic link
1
c-api-examples/asr-microphone-example/alsa.h
Symbolic link
@@ -0,0 +1 @@
|
||||
../../sherpa-onnx/csrc/alsa.h
|
||||
254
c-api-examples/asr-microphone-example/c-api-alsa.cc
Normal file
254
c-api-examples/asr-microphone-example/c-api-alsa.cc
Normal file
@@ -0,0 +1,254 @@
|
||||
// c-api-examples/asr-microphone-example/c-api-alsa.cc
|
||||
// Copyright (c) 2022-2024 Xiaomi Corporation
|
||||
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype> // std::tolower
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
#include "c-api-examples/asr-microphone-example/alsa.h"
|
||||
|
||||
// NOTE: You don't need to use cargs.h in your own project.
|
||||
// We use it in this file to parse commandline arguments
|
||||
#include "cargs.h" // NOLINT
|
||||
#include "sherpa-onnx/c-api/c-api.h"
|
||||
|
||||
// Command-line options understood by this example. The table is parsed
// with cargs in main(); each matched option's identifier selects a
// switch case that copies the value into the recognizer config.
static struct cag_option options[] = {
    // -h / --help: print kUsage and exit.
    {.identifier = 'h',
     .access_letters = "h",
     .access_name = "help",
     .description = "Show help"},
    // --tokens: vocabulary file mapping token ids to symbols.
    {.identifier = 't',
     .access_letters = NULL,
     .access_name = "tokens",
     .value_name = "tokens",
     .description = "Tokens file"},
    // --encoder/--decoder/--joiner: the three ONNX models of a
    // streaming transducer.
    {.identifier = 'e',
     .access_letters = NULL,
     .access_name = "encoder",
     .value_name = "encoder",
     .description = "Encoder ONNX file"},
    {.identifier = 'd',
     .access_letters = NULL,
     .access_name = "decoder",
     .value_name = "decoder",
     .description = "Decoder ONNX file"},
    {.identifier = 'j',
     .access_letters = NULL,
     .access_name = "joiner",
     .value_name = "joiner",
     .description = "Joiner ONNX file"},
    {.identifier = 'n',
     .access_letters = NULL,
     .access_name = "num-threads",
     .value_name = "num-threads",
     .description = "Number of threads"},
    {.identifier = 'p',
     .access_letters = NULL,
     .access_name = "provider",
     .value_name = "provider",
     .description = "Provider: cpu (default), cuda, coreml"},
    {.identifier = 'm',
     .access_letters = NULL,
     .access_name = "decoding-method",
     .value_name = "decoding-method",
     .description =
         "Decoding method: greedy_search (default), modified_beam_search"},
    // Hotword biasing; only effective with modified_beam_search.
    {.identifier = 'f',
     .access_letters = NULL,
     .access_name = "hotwords-file",
     .value_name = "hotwords-file",
     .description = "The file containing hotwords, one words/phrases per line, "
                    "and for each phrase the bpe/cjkchar are separated by a "
                    "space. For example: ▁HE LL O ▁WORLD, 你 好 世 界"},
    {.identifier = 's',
     .access_letters = NULL,
     .access_name = "hotwords-score",
     .value_name = "hotwords-score",
     .description = "The bonus score for each token in hotwords. Used only "
                    "when decoding_method is modified_beam_search"},
};
|
||||
|
||||
// Usage text shown for --help or when too few arguments are given.
// The trailing positional argument selects the ALSA capture device.
// Fix: the --joiner example path previously said decoder.onnx
// (copy-paste error); it now points at joiner.onnx.
const char *kUsage =
    R"(
Usage:
  ./bin/c-api-alsa \
    --tokens=/path/to/tokens.txt \
    --encoder=/path/to/encoder.onnx \
    --decoder=/path/to/decoder.onnx \
    --joiner=/path/to/joiner.onnx \
    device_name

The device name specifies which microphone to use in case there are several
on your system. You can use

  arecord -l

to find all available microphones on your computer. For instance, if it outputs

**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

and if you want to select card 3 and the device 0 on that card, please use:

  plughw:3,0

as the device_name.
)";
|
||||
|
||||
// Set from the SIGINT handler and polled by the capture loop in main()
// (`while (!stop)`), so the program can free resources before exiting.
// Fix: a plain `bool` is not guaranteed to be safely writable from a
// signal handler; `volatile sig_atomic_t` is the type the standard
// guarantees for this purpose (<signal.h> is already included).
volatile sig_atomic_t stop = 0;

// SIGINT (Ctrl+C) handler: request a graceful shutdown of the loop.
// NOTE(review): fprintf is not async-signal-safe; it is tolerated in
// this demo, but a production handler should only set the flag.
static void Handler(int /*sig*/) {
  stop = 1;
  fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
|
||||
|
||||
// Entry point: parse flags with cargs, build a streaming transducer
// recognizer, then read 100 ms chunks from the ALSA device and decode
// them until Ctrl+C sets `stop`.
int32_t main(int32_t argc, char *argv[]) {
  // --tokens, --encoder, --decoder, --joiner plus the device name are
  // all required, hence the minimum of 6 argv entries.
  if (argc < 6) {
    fprintf(stderr, "%s\n", kUsage);
    exit(0);
  }

  signal(SIGINT, Handler);

  // Zero-initialize so unset fields take the C API's defaults.
  SherpaOnnxOnlineRecognizerConfig config;
  memset(&config, 0, sizeof(config));

  config.model_config.debug = 0;
  config.model_config.num_threads = 1;
  config.model_config.provider = "cpu";

  config.decoding_method = "greedy_search";

  // Beam size; only relevant for modified_beam_search.
  config.max_active_paths = 4;

  config.feat_config.sample_rate = 16000;
  config.feat_config.feature_dim = 80;

  // Endpointing: rule1/rule2 are trailing-silence thresholds in
  // seconds; rule3 caps the utterance length (frames).
  config.enable_endpoint = 1;
  config.rule1_min_trailing_silence = 2.4;
  config.rule2_min_trailing_silence = 1.2;
  config.rule3_min_utterance_length = 300;

  cag_option_context context;
  char identifier;
  const char *value;

  cag_option_prepare(&context, options, CAG_ARRAY_SIZE(options), argc, argv);

  // Copy each recognized flag into the corresponding config field.
  while (cag_option_fetch(&context)) {
    identifier = cag_option_get(&context);
    value = cag_option_get_value(&context);
    switch (identifier) {
      case 't':
        config.model_config.tokens = value;
        break;
      case 'e':
        config.model_config.transducer.encoder = value;
        break;
      case 'd':
        config.model_config.transducer.decoder = value;
        break;
      case 'j':
        config.model_config.transducer.joiner = value;
        break;
      case 'n':
        config.model_config.num_threads = atoi(value);
        break;
      case 'p':
        config.model_config.provider = value;
        break;
      case 'm':
        config.decoding_method = value;
        break;
      case 'f':
        config.hotwords_file = value;
        break;
      case 's':
        config.hotwords_score = atof(value);
        break;
      case 'h': {
        fprintf(stderr, "%s\n", kUsage);
        exit(0);
        break;
      }
      default:
        // do nothing as config already has valid default values
        break;
    }
  }

  SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config);
  SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);

  // Display helper that pretty-prints partial results; 50 is the
  // max-word-per-line setting.
  SherpaOnnxDisplay *display = CreateDisplay(50);
  int32_t segment_id = 0;  // NOTE(review): unused; segment_index below is used.

  // The first non-option argument is the ALSA device, e.g. plughw:3,0.
  const char *device_name = argv[context.index];
  sherpa_onnx::Alsa alsa(device_name);
  fprintf(stderr, "Use recording device: %s\n", device_name);
  fprintf(stderr,
          "Please \033[32m\033[1mspeak\033[0m! Press \033[31m\033[1mCtrl + "
          "C\033[0m to exit\n");

  // Must match feat_config.sample_rate configured above.
  int32_t expected_sample_rate = 16000;

  if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
            expected_sample_rate);
    exit(-1);
  }

  // Read the microphone in 100 ms chunks.
  int32_t chunk = 0.1 * alsa.GetActualSampleRate();

  std::string last_text;

  int32_t segment_index = 0;

  // Capture/decode loop; `stop` is set by the SIGINT handler.
  while (!stop) {
    const std::vector<float> &samples = alsa.Read(chunk);
    AcceptWaveform(stream, expected_sample_rate, samples.data(),
                   samples.size());
    // Decode as long as the stream has enough buffered audio.
    while (IsOnlineStreamReady(recognizer, stream)) {
      DecodeOnlineStream(recognizer, stream);
    }

    const SherpaOnnxOnlineRecognizerResult *r =
        GetOnlineStreamResult(recognizer, stream);

    std::string text = r->text;
    DestroyOnlineRecognizerResult(r);

    // Only refresh the display when the partial result changed.
    if (!text.empty() && last_text != text) {
      last_text = text;

      // Lower-case the result for nicer display output.
      std::transform(text.begin(), text.end(), text.begin(),
                     [](auto c) { return std::tolower(c); });

      SherpaOnnxPrint(display, segment_index, text.c_str());
      fflush(stderr);
    }

    // On an endpoint, advance to a new display segment (if anything
    // was recognized) and reset the stream for the next utterance.
    if (IsEndpoint(recognizer, stream)) {
      if (!text.empty()) {
        ++segment_index;
      }
      Reset(recognizer, stream);
    }
  }

  // free allocated resources
  DestroyDisplay(display);
  DestroyOnlineStream(stream);
  DestroyOnlineRecognizer(recognizer);
  fprintf(stderr, "\n");

  return 0;
}
|
||||
Reference in New Issue
Block a user