diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 95983cd8..478dd8ee 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -6,3 +6,9 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs) add_executable(offline-tts-c-api offline-tts-c-api.c) target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs) + +if(SHERPA_ONNX_HAS_ALSA) + add_subdirectory(./asr-microphone-example) +else() + message(WARNING "Not including ./asr-microphone-example since alsa is not available") +endif() diff --git a/c-api-examples/asr-microphone-example/CMakeLists.txt b/c-api-examples/asr-microphone-example/CMakeLists.txt new file mode 100644 index 00000000..1c486bb3 --- /dev/null +++ b/c-api-examples/asr-microphone-example/CMakeLists.txt @@ -0,0 +1,9 @@ + +add_executable(c-api-alsa c-api-alsa.cc alsa.cc) +target_link_libraries(c-api-alsa sherpa-onnx-c-api cargs) + +if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) + target_link_libraries(c-api-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) +else() + target_link_libraries(c-api-alsa asound) +endif() diff --git a/c-api-examples/asr-microphone-example/CPPLINT.cfg b/c-api-examples/asr-microphone-example/CPPLINT.cfg new file mode 100644 index 00000000..f1b97ab7 --- /dev/null +++ b/c-api-examples/asr-microphone-example/CPPLINT.cfg @@ -0,0 +1 @@ +exclude_files=alsa.cc|alsa.h diff --git a/c-api-examples/asr-microphone-example/README.md b/c-api-examples/asr-microphone-example/README.md new file mode 100644 index 00000000..50e24235 --- /dev/null +++ b/c-api-examples/asr-microphone-example/README.md @@ -0,0 +1,12 @@ +# Introduction + +This folder contains examples for real-time speech recognition from a microphone +using sherpa-onnx C API. + +**Note**: You can call C API from C++ files. + + +## ./c-api-alsa.cc + +This file uses alsa to read a microphone. It runs only on Linux. This file +does not support macOS or Windows. 
diff --git a/c-api-examples/asr-microphone-example/alsa.cc b/c-api-examples/asr-microphone-example/alsa.cc new file mode 120000 index 00000000..7acd97ce --- /dev/null +++ b/c-api-examples/asr-microphone-example/alsa.cc @@ -0,0 +1 @@ +../../sherpa-onnx/csrc/alsa.cc \ No newline at end of file diff --git a/c-api-examples/asr-microphone-example/alsa.h b/c-api-examples/asr-microphone-example/alsa.h new file mode 120000 index 00000000..cde29958 --- /dev/null +++ b/c-api-examples/asr-microphone-example/alsa.h @@ -0,0 +1 @@ +../../sherpa-onnx/csrc/alsa.h \ No newline at end of file diff --git a/c-api-examples/asr-microphone-example/c-api-alsa.cc b/c-api-examples/asr-microphone-example/c-api-alsa.cc new file mode 100644 index 00000000..8326462b --- /dev/null +++ b/c-api-examples/asr-microphone-example/c-api-alsa.cc @@ -0,0 +1,254 @@ +// c-api-examples/asr-microphone-example/c-api-alsa.cc +// Copyright (c) 2022-2024 Xiaomi Corporation + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <algorithm> +#include <cctype>  // std::tolower +#include <string> +#include <vector> + +#include "c-api-examples/asr-microphone-example/alsa.h" + +// NOTE: You don't need to use cargs.h in your own project. 
+// We use it in this file to parse commandline arguments +#include "cargs.h" // NOLINT +#include "sherpa-onnx/c-api/c-api.h" + +static struct cag_option options[] = { + {.identifier = 'h', + .access_letters = "h", + .access_name = "help", + .description = "Show help"}, + {.identifier = 't', + .access_letters = NULL, + .access_name = "tokens", + .value_name = "tokens", + .description = "Tokens file"}, + {.identifier = 'e', + .access_letters = NULL, + .access_name = "encoder", + .value_name = "encoder", + .description = "Encoder ONNX file"}, + {.identifier = 'd', + .access_letters = NULL, + .access_name = "decoder", + .value_name = "decoder", + .description = "Decoder ONNX file"}, + {.identifier = 'j', + .access_letters = NULL, + .access_name = "joiner", + .value_name = "joiner", + .description = "Joiner ONNX file"}, + {.identifier = 'n', + .access_letters = NULL, + .access_name = "num-threads", + .value_name = "num-threads", + .description = "Number of threads"}, + {.identifier = 'p', + .access_letters = NULL, + .access_name = "provider", + .value_name = "provider", + .description = "Provider: cpu (default), cuda, coreml"}, + {.identifier = 'm', + .access_letters = NULL, + .access_name = "decoding-method", + .value_name = "decoding-method", + .description = + "Decoding method: greedy_search (default), modified_beam_search"}, + {.identifier = 'f', + .access_letters = NULL, + .access_name = "hotwords-file", + .value_name = "hotwords-file", + .description = "The file containing hotwords, one words/phrases per line, " + "and for each phrase the bpe/cjkchar are separated by a " + "space. For example: ▁HE LL O ▁WORLD, 你 好 世 界"}, + {.identifier = 's', + .access_letters = NULL, + .access_name = "hotwords-score", + .value_name = "hotwords-score", + .description = "The bonus score for each token in hotwords. 
Used only " + "when decoding_method is modified_beam_search"}, +}; + +const char *kUsage = + R"( +Usage: + ./bin/c-api-alsa \ + --tokens=/path/to/tokens.txt \ + --encoder=/path/to/encoder.onnx \ + --decoder=/path/to/decoder.onnx \ + --joiner=/path/to/joiner.onnx \ + device_name + +The device name specifies which microphone to use in case there are several +on your system. You can use + + arecord -l + +to find all available microphones on your computer. For instance, if it outputs + +**** List of CAPTURE Hardware Devices **** +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + +and if you want to select card 3 and the device 0 on that card, please use: + + plughw:3,0 + +as the device_name. +)"; + +bool stop = false; + +static void Handler(int sig) { + stop = true; + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); +} + +int32_t main(int32_t argc, char *argv[]) { + if (argc < 6) { + fprintf(stderr, "%s\n", kUsage); + exit(0); + } + + signal(SIGINT, Handler); + + SherpaOnnxOnlineRecognizerConfig config; + memset(&config, 0, sizeof(config)); + + config.model_config.debug = 0; + config.model_config.num_threads = 1; + config.model_config.provider = "cpu"; + + config.decoding_method = "greedy_search"; + + config.max_active_paths = 4; + + config.feat_config.sample_rate = 16000; + config.feat_config.feature_dim = 80; + + config.enable_endpoint = 1; + config.rule1_min_trailing_silence = 2.4; + config.rule2_min_trailing_silence = 1.2; + config.rule3_min_utterance_length = 300; + + cag_option_context context; + char identifier; + const char *value; + + cag_option_prepare(&context, options, CAG_ARRAY_SIZE(options), argc, argv); + + while (cag_option_fetch(&context)) { + identifier = cag_option_get(&context); + value = cag_option_get_value(&context); + switch (identifier) { + case 't': + config.model_config.tokens = value; + break; + case 'e': + config.model_config.transducer.encoder = value; + break; + case 
'd': + config.model_config.transducer.decoder = value; + break; + case 'j': + config.model_config.transducer.joiner = value; + break; + case 'n': + config.model_config.num_threads = atoi(value); + break; + case 'p': + config.model_config.provider = value; + break; + case 'm': + config.decoding_method = value; + break; + case 'f': + config.hotwords_file = value; + break; + case 's': + config.hotwords_score = atof(value); + break; + case 'h': { + fprintf(stderr, "%s\n", kUsage); + exit(0); + break; + } + default: + // do nothing as config already has valid default values + break; + } + } + + SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&config); + SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer); + + SherpaOnnxDisplay *display = CreateDisplay(50); + int32_t segment_id = 0; + + const char *device_name = argv[context.index]; + sherpa_onnx::Alsa alsa(device_name); + fprintf(stderr, "Use recording device: %s\n", device_name); + fprintf(stderr, + "Please \033[32m\033[1mspeak\033[0m! 
Press \033[31m\033[1mCtrl + " + "C\033[0m to exit\n"); + + int32_t expected_sample_rate = 16000; + + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), + expected_sample_rate); + exit(-1); + } + + int32_t chunk = 0.1 * alsa.GetActualSampleRate(); + + std::string last_text; + + int32_t segment_index = 0; + + while (!stop) { + const std::vector &samples = alsa.Read(chunk); + AcceptWaveform(stream, expected_sample_rate, samples.data(), + samples.size()); + while (IsOnlineStreamReady(recognizer, stream)) { + DecodeOnlineStream(recognizer, stream); + } + + const SherpaOnnxOnlineRecognizerResult *r = + GetOnlineStreamResult(recognizer, stream); + + std::string text = r->text; + DestroyOnlineRecognizerResult(r); + + if (!text.empty() && last_text != text) { + last_text = text; + + std::transform(text.begin(), text.end(), text.begin(), + [](auto c) { return std::tolower(c); }); + + SherpaOnnxPrint(display, segment_index, text.c_str()); + fflush(stderr); + } + + if (IsEndpoint(recognizer, stream)) { + if (!text.empty()) { + ++segment_index; + } + Reset(recognizer, stream); + } + } + + // free allocated resources + DestroyDisplay(display); + DestroyOnlineStream(stream); + DestroyOnlineRecognizer(recognizer); + fprintf(stderr, "\n"); + + return 0; +} diff --git a/c-api-examples/decode-file-c-api.c b/c-api-examples/decode-file-c-api.c index 542cab9c..46cb11a8 100644 --- a/c-api-examples/decode-file-c-api.c +++ b/c-api-examples/decode-file-c-api.c @@ -157,7 +157,7 @@ int32_t main(int32_t argc, char *argv[]) { break; } default: - // do nothing as config already have valid default values + // do nothing as config already has valid default values break; } }