diff --git a/.github/workflows/mfc.yaml b/.github/workflows/mfc.yaml
index 44c65da4..cdf5f9c1 100644
--- a/.github/workflows/mfc.yaml
+++ b/.github/workflows/mfc.yaml
@@ -98,6 +98,7 @@ jobs:
cd mfc-examples/$arch/Release
cp StreamingSpeechRecognition.exe sherpa-onnx-streaming-${SHERPA_ONNX_VERSION}.exe
+ cp NonStreamingSpeechRecognition.exe sherpa-onnx-non-streaming-${SHERPA_ONNX_VERSION}.exe
ls -lh
- name: Upload artifact
@@ -106,10 +107,24 @@ jobs:
name: streaming-speech-recognition-${{ matrix.arch }}
path: ./mfc-examples/${{ matrix.arch }}/Release/StreamingSpeechRecognition.exe
- - name: Release pre-compiled binaries and libs for macOS
+ - name: Upload artifact
+ uses: actions/upload-artifact@v2
+ with:
+ name: non-streaming-speech-recognition-${{ matrix.arch }}
+ path: ./mfc-examples/${{ matrix.arch }}/Release/NonStreamingSpeechRecognition.exe
+
+ - name: Release pre-compiled binaries and libs for Windows ${{ matrix.arch }}
if: env.RELEASE == 'true'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
- file: ./mfc-examples/${{ matrix.arch }}/Release/sherpa-onnx*.exe
+ file: ./mfc-examples/${{ matrix.arch }}/Release/sherpa-onnx-streaming-*.exe
+
+ - name: Release pre-compiled binaries and libs for Windows ${{ matrix.arch }}
+ if: env.RELEASE == 'true'
+ uses: svenstaro/upload-release-action@v2
+ with:
+ file_glob: true
+ overwrite: true
+ file: ./mfc-examples/${{ matrix.arch }}/Release/sherpa-onnx-non-streaming-*.exe
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 50bedd94..033d2ceb 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -113,7 +113,7 @@ function(download_onnxruntime)
set(onnxruntime_URL "https://huggingface.co/csukuangfj/onnxruntime-libs/resolve/main/onnxruntime-win-x86-static-1.15.1.tar.bz2")
set(onnxruntime_URL2 "")
- set(onnxruntime_HASH "SHA256=a2b33a3e8a1f89cddf303f0a97a5a88f4202579c653cfb29158c8cf7da3734eb")
+ set(onnxruntime_HASH "SHA256=94d9a30976b5c4a5dff7508d00f141835916e5a36315d5f53be9b3edb85148b5")
endif()
if(SHERPA_ONNX_ENABLE_GPU)
@@ -161,7 +161,7 @@ function(download_onnxruntime)
set(onnxruntime_URL "https://huggingface.co/csukuangfj/onnxruntime-libs/resolve/main/onnxruntime-win-x64-static-1.15.1.tar.bz2")
set(onnxruntime_URL2 "")
- set(onnxruntime_HASH "SHA256=f5c19ac1fc6a61c78a231a41df10aede2586665ab397bdc3f007eb8d2c8d4a19")
+ set(onnxruntime_HASH "SHA256=c809a8510a89b8b37ae7d563c39229db22bac8fbefcbfe5c81a60b367d065b1b")
endif()
endif()
# After downloading, it contains:
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.cpp b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.cpp
new file mode 100644
index 00000000..66a85c49
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.cpp
@@ -0,0 +1,86 @@
+
+// NonStreamingSpeechRecognition.cpp : Defines the class behaviors for the
+// application.
+//
+
+// clang-format off
+#include "pch.h"
+#include "framework.h"
+#include "NonStreamingSpeechRecognitionDlg.h"
+#include "NonStreamingSpeechRecognition.h"
+// clang-format on
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#endif
+
+// CNonStreamingSpeechRecognitionApp
+
+// Message map of the application object: ID_HELP commands are routed to
+// CWinApp::OnHelp.
+BEGIN_MESSAGE_MAP(CNonStreamingSpeechRecognitionApp, CWinApp)
+ON_COMMAND(ID_HELP, &CWinApp::OnHelp)
+END_MESSAGE_MAP()
+
+// CNonStreamingSpeechRecognitionApp construction
+
+// Default constructor. The body is intentionally empty; as the note inside
+// says, significant initialization belongs in InitInstance().
+CNonStreamingSpeechRecognitionApp::CNonStreamingSpeechRecognitionApp() {
+ // TODO: add construction code here,
+ // Place all significant initialization in InitInstance
+}
+
+// The one and only CNonStreamingSpeechRecognitionApp object
+
+CNonStreamingSpeechRecognitionApp theApp;
+
+// CNonStreamingSpeechRecognitionApp initialization
+
+// Application start-up: shows the main dialog modally and, once it has been
+// dismissed, releases the helpers created here.
+//
+// Always returns FALSE so that the framework exits instead of starting the
+// application's message pump.
+BOOL CNonStreamingSpeechRecognitionApp::InitInstance() {
+  CWinApp::InitInstance();
+
+  // Needed in case the dialog contains any shell tree view or shell list
+  // view controls.
+  CShellManager *shell_manager = new CShellManager;
+
+  // The "Windows Native" visual manager enables themes in MFC controls.
+  CMFCVisualManager::SetDefaultManager(RUNTIME_CLASS(CMFCVisualManagerWindows));
+
+  // Registry key under which the application settings are stored.
+  // TODO: You should modify this string to be something appropriate
+  // such as the name of your company or organization
+  SetRegistryKey(_T("Local AppWizard-Generated Applications"));
+
+  CNonStreamingSpeechRecognitionDlg dlg;
+  m_pMainWnd = &dlg;
+
+  // IDOK and IDCANCEL need no handling here; only a failed dialog creation
+  // (-1) is reported.
+  const INT_PTR response = dlg.DoModal();
+  if (response == -1) {
+    TRACE(traceAppMsg, 0,
+          "Warning: dialog creation failed, so application is terminating "
+          "unexpectedly.\n");
+    TRACE(traceAppMsg, 0,
+          "Warning: if you are using MFC controls on the dialog, you cannot "
+          "#define _AFX_NO_MFC_CONTROLS_IN_DIALOGS.\n");
+  }
+
+  // Release the shell manager created above.
+  delete shell_manager;
+
+#if !defined(_AFXDLL) && !defined(_AFX_NO_MFC_CONTROLS_IN_DIALOGS)
+  ControlBarCleanUp();
+#endif
+
+  // The dialog has been closed: returning FALSE makes the application exit
+  // rather than enter the message pump.
+  return FALSE;
+}
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.h b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.h
new file mode 100644
index 00000000..bb176491
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.h
@@ -0,0 +1,31 @@
+
+// NonStreamingSpeechRecognition.h : main header file for the
+// NonStreamingSpeechRecognition application
+//
+
+#pragma once
+
+#ifndef __AFXWIN_H__
+#error "include 'pch.h' before including this file for PCH"
+#endif
+
+#include "resource.h" // main symbols
+
+// CNonStreamingSpeechRecognitionApp:
+// See NonStreamingSpeechRecognition.cpp for the implementation of this class
+//
+
+// Application object of the example; derives from CWinApp.
+class CNonStreamingSpeechRecognitionApp : public CWinApp {
+ public:
+ CNonStreamingSpeechRecognitionApp();
+
+ // Overrides
+ public:
+ // Start-up hook; defined in NonStreamingSpeechRecognition.cpp.
+ virtual BOOL InitInstance();
+
+ // Implementation
+
+ DECLARE_MESSAGE_MAP()
+};
+
+extern CNonStreamingSpeechRecognitionApp theApp;
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.rc b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.rc
new file mode 100644
index 00000000..7730ef76
Binary files /dev/null and b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.rc differ
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj
new file mode 100644
index 00000000..b831c06f
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj
@@ -0,0 +1,219 @@
+
+
+
+
+ Debug
+ Win32
+
+
+ Release
+ Win32
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+ 17.0
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}
+ MFCProj
+ NonStreamingSpeechRecognition
+ 10.0
+
+
+
+ Application
+ true
+ v143
+ Unicode
+ Static
+
+
+ Application
+ false
+ v143
+ true
+ Unicode
+ Static
+
+
+ Application
+ true
+ v143
+ Unicode
+ Static
+
+
+ Application
+ false
+ v143
+ true
+ Unicode
+ Static
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ false
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+
+ Use
+ Level3
+ true
+ true
+ true
+ _WINDOWS;NDEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+ true
+ true
+
+
+ false
+ true
+ NDEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ NDEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+ Use
+ Level3
+ true
+ WIN32;_WINDOWS;_DEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+
+
+ false
+ true
+ _DEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ _DEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+ Use
+ Level3
+ true
+ _WINDOWS;_DEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+
+
+ false
+ true
+ _DEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ _DEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+ Use
+ Level3
+ true
+ true
+ true
+ WIN32;_WINDOWS;NDEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+ true
+ true
+
+
+ false
+ true
+ NDEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ NDEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create
+ Create
+ Create
+ Create
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj.filters b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj.filters
new file mode 100644
index 00000000..32434b5a
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj.filters
@@ -0,0 +1,63 @@
+
+
+
+
+ {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
+ cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
+
+
+ {93995380-89BD-4b04-88EB-625FBE52EBFB}
+ h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
+
+
+ {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
+ rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Resource Files
+
+
+
+
+ Resource Files
+
+
+
+
+ Resource Files
+
+
+
\ No newline at end of file
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
new file mode 100644
index 00000000..52b699c0
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp
@@ -0,0 +1,473 @@
+
+// NonStreamingSpeechRecognitionDlg.cpp : implementation file
+//
+
+// clang-format off
+#include "pch.h"
+#include "framework.h"
+#include "afxdialogex.h"
+#include "NonStreamingSpeechRecognition.h"
+#include "NonStreamingSpeechRecognitionDlg.h"
+// clang-format on
+
+// Standard headers for the names used in this file: std::ifstream,
+// std::ostringstream, std::logic_error, std::string/std::wstring and
+// std::vector.
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#endif
+
+// Initializes the PortAudio library. When initialization fails, the
+// PortAudio error text is written to stderr and the process exits with
+// status -2.
+Microphone::Microphone() {
+  const PaError status = Pa_Initialize();
+  if (status == paNoError) {
+    return;
+  }
+
+  fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(status));
+  exit(-2);
+}
+
+// Shuts the PortAudio library down.
+//
+// A failing Pa_Terminate() is only reported on stderr. exit() is deliberately
+// not called: starting process termination from inside a destructor would
+// run while the owning object is still being destroyed and would skip the
+// remaining clean-up of the caller.
+Microphone::~Microphone() {
+  const PaError status = Pa_Terminate();
+  if (status != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(status));
+  }
+}
+
+// Converts a UTF-8 encoded string to UTF-16.
+//
+// Code points up to U+FFFF become one wchar_t; larger code points are
+// written as a surrogate pair.
+//
+// Throws std::logic_error("not a UTF-8 string") when the input contains an
+// invalid lead byte, a truncated or malformed continuation sequence, an
+// encoded surrogate (U+D800..U+DFFF) or a value above U+10FFFF. Overlong
+// encodings are not rejected.
+//
+// see
+// https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
+static std::wstring Utf8ToUtf16(const std::string &utf8) {
+  std::wstring utf16;
+  utf16.reserve(utf8.size());  // UTF-16 never needs more units than UTF-8
+
+  size_t i = 0;
+  while (i < utf8.size()) {
+    const unsigned char lead = static_cast<unsigned char>(utf8[i++]);
+
+    unsigned long uni = 0;  // code point being assembled
+    size_t todo = 0;        // number of continuation bytes still expected
+    if (lead <= 0x7F) {
+      uni = lead;
+    } else if (lead <= 0xBF) {
+      // a continuation byte cannot start a sequence
+      throw std::logic_error("not a UTF-8 string");
+    } else if (lead <= 0xDF) {
+      uni = lead & 0x1F;
+      todo = 1;
+    } else if (lead <= 0xEF) {
+      uni = lead & 0x0F;
+      todo = 2;
+    } else if (lead <= 0xF7) {
+      uni = lead & 0x07;
+      todo = 3;
+    } else {
+      throw std::logic_error("not a UTF-8 string");
+    }
+
+    for (size_t j = 0; j < todo; ++j) {
+      if (i == utf8.size()) throw std::logic_error("not a UTF-8 string");
+      const unsigned char next = static_cast<unsigned char>(utf8[i++]);
+      if (next < 0x80 || next > 0xBF) {
+        throw std::logic_error("not a UTF-8 string");
+      }
+      uni = (uni << 6) + (next & 0x3F);
+    }
+
+    if (uni >= 0xD800 && uni <= 0xDFFF) {
+      throw std::logic_error("not a UTF-8 string");
+    }
+    if (uni > 0x10FFFF) throw std::logic_error("not a UTF-8 string");
+
+    if (uni <= 0xFFFF) {
+      utf16 += static_cast<wchar_t>(uni);
+    } else {
+      uni -= 0x10000;
+      utf16 += static_cast<wchar_t>((uni >> 10) + 0xD800);
+      utf16 += static_cast<wchar_t>((uni & 0x3FF) + 0xDC00);
+    }
+  }
+  return utf16;
+}
+
+// Joins the recognition results into one string for display. Every entry is
+// written as "<index>: <text>" (index starting at 0) and entries are
+// separated by "\r\n". An empty input yields an empty string.
+static std::string Cat(const std::vector<std::string> &results) {
+  std::ostringstream os;
+  const char *sep = "";  // no separator in front of the first entry
+
+  for (size_t i = 0; i != results.size(); ++i) {
+    os << sep << i << ": " << results[i];
+    sep = "\r\n";
+  }
+
+  return os.str();
+}
+
+// CNonStreamingSpeechRecognitionDlg dialog
+
+// Constructs the dialog from the IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG
+// template and loads the IDR_MAINFRAME icon that OnInitDialog() and
+// OnPaint() use.
+CNonStreamingSpeechRecognitionDlg::CNonStreamingSpeechRecognitionDlg(
+ CWnd *pParent /*=nullptr*/)
+ : CDialogEx(IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG, pParent) {
+ m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME);
+}
+
+// Releases the audio stream (if one is still open) and the recognizer.
+CNonStreamingSpeechRecognitionDlg::~CNonStreamingSpeechRecognitionDlg() {
+  // The dialog can be destroyed while a recording is still running, e.g.
+  // when it is closed without clicking "Stop". RecordCallback() appends to
+  // samples_, so the stream must be closed here, before the data members
+  // are destroyed.
+  if (pa_stream_) {
+    started_ = false;
+    Pa_CloseStream(pa_stream_);  // nothing useful can be done on failure
+    pa_stream_ = nullptr;
+  }
+
+  if (recognizer_) {
+    DestroyOfflineRecognizer(recognizer_);
+    recognizer_ = nullptr;
+  }
+}
+
+// Binds the dialog controls to their member objects: IDC_EDIT1 to my_text_
+// and the IDOK button to my_btn_.
+void CNonStreamingSpeechRecognitionDlg::DoDataExchange(CDataExchange *pDX) {
+ CDialogEx::DoDataExchange(pDX);
+ DDX_Control(pDX, IDC_EDIT1, my_text_);
+ DDX_Control(pDX, IDOK, my_btn_);
+}
+
+// Message map of the dialog: paint and drag-icon queries plus clicks on the
+// IDOK button, which are handled by OnBnClickedOk().
+BEGIN_MESSAGE_MAP(CNonStreamingSpeechRecognitionDlg, CDialogEx)
+ON_WM_PAINT()
+ON_WM_QUERYDRAGICON()
+ON_BN_CLICKED(IDOK, &CNonStreamingSpeechRecognitionDlg::OnBnClickedOk)
+END_MESSAGE_MAP()
+
+// CNonStreamingSpeechRecognitionDlg message handlers
+
+// Dialog initialization: sets the big and small icons and then calls
+// InitMicrophone(). Returns TRUE.
+BOOL CNonStreamingSpeechRecognitionDlg::OnInitDialog() {
+ CDialogEx::OnInitDialog();
+
+ // Set the icon for this dialog. The framework does this automatically
+ // when the application's main window is not a dialog
+ SetIcon(m_hIcon, TRUE); // Set big icon
+ SetIcon(m_hIcon, FALSE); // Set small icon
+
+ // TODO: Add extra initialization here
+ InitMicrophone();
+
+ return TRUE; // return TRUE unless you set the focus to a control
+}
+
+// If you add a minimize button to your dialog, you will need the code below
+// to draw the icon. For MFC applications using the document/view model,
+// this is automatically done for you by the framework.
+
+// Paint handler. While the dialog is minimized it draws the application icon
+// centered in the client rectangle; otherwise painting is left to CDialogEx.
+void CNonStreamingSpeechRecognitionDlg::OnPaint() {
+  if (!IsIconic()) {
+    CDialogEx::OnPaint();
+    return;
+  }
+
+  CPaintDC dc(this);  // device context for painting
+
+  SendMessage(WM_ICONERASEBKGND, reinterpret_cast<WPARAM>(dc.GetSafeHdc()),
+              0);
+
+  // Center icon in client rectangle
+  const int cxIcon = GetSystemMetrics(SM_CXICON);
+  const int cyIcon = GetSystemMetrics(SM_CYICON);
+  CRect rect;
+  GetClientRect(&rect);
+  const int x = (rect.Width() - cxIcon + 1) / 2;
+  const int y = (rect.Height() - cyIcon + 1) / 2;
+
+  // Draw the icon
+  dc.DrawIcon(x, y, m_hIcon);
+}
+
+// The system calls this function to obtain the cursor to display while the user
+// drags
+// the minimized window.
+// Returns the dialog icon, converted to a cursor handle.
+HCURSOR CNonStreamingSpeechRecognitionDlg::OnQueryDragIcon() {
+  return static_cast<HCURSOR>(m_hIcon);
+}
+
+// Stream callback handed to Pa_OpenStream() by OnBnClickedOk().
+//
+// Appends the captured samples to the dialog's samples_ buffer. The stream
+// is opened with one channel of paFloat32 data, so frames_per_buffer is also
+// the number of float samples in input_buffer.
+//
+// user_data is the CNonStreamingSpeechRecognitionDlg that opened the stream.
+// Returns paContinue while the dialog's started_ flag is set and paComplete
+// once it has been cleared.
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void *user_data) {
+  auto dlg = static_cast<CNonStreamingSpeechRecognitionDlg *>(user_data);
+  auto begin = static_cast<const float *>(input_buffer);
+
+  // Guard against a missing input buffer instead of reading through nullptr.
+  if (begin) {
+    dlg->samples_.insert(dlg->samples_.end(), begin,
+                         begin + frames_per_buffer);
+  }
+
+  return dlg->started_ ? paContinue : paComplete;
+}
+
+// Handler of the Start/Stop button.
+//
+// The first click creates the recognizer (see InitRecognizer()). After that
+// the button toggles between two actions:
+//  - Start: opens and starts a mono float32 input stream on the default
+//    input device; RecordCallback() collects the audio in samples_.
+//  - Stop: closes the stream, decodes everything recorded since Start and
+//    shows all results obtained so far in the edit control.
+//
+// On a PortAudio error the message is shown and the button is disabled.
+void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() {
+  if (!recognizer_) {
+    AppendLineToMultilineEditCtrl("Creating recognizer...");
+    AppendLineToMultilineEditCtrl("It will take several seconds. Please wait");
+    InitRecognizer();
+    if (!recognizer_) {
+      // failed to create the recognizer
+      return;
+    }
+    AppendLineToMultilineEditCtrl("Recognizer created!");
+  }
+
+  if (!started_) {
+    PaStreamParameters param;
+    param.device = Pa_GetDefaultInputDevice();
+    const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
+    if (!info) {
+      // e.g. the input device disappeared after the dialog was created
+      AppendLineToMultilineEditCtrl("No default input device found!");
+      my_btn_.EnableWindow(FALSE);
+      return;
+    }
+    param.channelCount = 1;
+    param.sampleFormat = paFloat32;
+    param.suggestedLatency = info->defaultLowInputLatency;
+    param.hostApiSpecificStreamInfo = nullptr;
+    float sample_rate = config_.feat_config.sample_rate;
+
+    // Both must be in place before the callback can run.
+    samples_.clear();
+    started_ = true;
+
+    pa_stream_ = nullptr;
+    PaError err =
+        Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
+                      sample_rate,
+                      0,          // frames per buffer
+                      paClipOff,  // we won't output out of range samples
+                                  // so don't bother clipping them
+                      RecordCallback, this);
+    if (err == paNoError) {
+      err = Pa_StartStream(pa_stream_);
+      if (err != paNoError) {
+        // do not leak the stream that was opened successfully
+        Pa_CloseStream(pa_stream_);
+      }
+    }
+
+    if (err != paNoError) {
+      pa_stream_ = nullptr;
+      started_ = false;  // nothing is being recorded
+      AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
+                                    Pa_GetErrorText(err));
+      my_btn_.EnableWindow(FALSE);
+      return;
+    }
+
+    AppendLineToMultilineEditCtrl(
+        "\r\nStarted! Please speak and click stop.\r\n");
+    my_btn_.SetWindowText(_T("Stop"));
+    return;
+  }
+
+  // Stop: tell the callback to finish.
+  started_ = false;
+
+  // presumably gives the callback time to deliver the last buffers
+  Pa_Sleep(200);  // sleep for 200ms
+  if (pa_stream_) {
+    PaError err = Pa_CloseStream(pa_stream_);
+    pa_stream_ = nullptr;  // never keep a handle that was passed to close
+    if (err != paNoError) {
+      AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
+                                    Pa_GetErrorText(err));
+      my_btn_.EnableWindow(FALSE);
+      return;
+    }
+  }
+
+  SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_);
+
+  AcceptWaveformOffline(stream, config_.feat_config.sample_rate,
+                        samples_.data(), samples_.size());
+  DecodeOfflineStream(recognizer_, stream);
+  SherpaOnnxOfflineRecognizerResult *r = GetOfflineStreamResult(stream);
+  // An absent result or text is recorded as an empty entry.
+  results_.emplace_back((r && r->text) ? r->text : "");
+
+  auto str = Utf8ToUtf16(Cat(results_));
+  my_text_.SetWindowText(str.c_str());
+  my_text_.SetFocus();
+  my_text_.SetSel(-1);
+
+  if (r) {
+    DestroyOfflineRecognizerResult(r);
+  }
+
+  DestroyOfflineStream(stream);
+  // AfxMessageBox("Stopped", MB_OK);
+  my_btn_.SetWindowText(_T("Start"));
+  AppendLineToMultilineEditCtrl("\r\nStopped. Please click start and speak");
+}
+
+// Reports the default input device in the edit control. When there is no
+// default input device, or no information is available for it, a message is
+// shown instead and the Start button is disabled.
+void CNonStreamingSpeechRecognitionDlg::InitMicrophone() {
+  const PaDeviceIndex default_device = Pa_GetDefaultInputDevice();
+  const PaDeviceInfo *info = (default_device == paNoDevice)
+                                 ? nullptr
+                                 : Pa_GetDeviceInfo(default_device);
+  if (!info) {
+    AppendLineToMultilineEditCtrl("No default input device found!");
+    my_btn_.EnableWindow(FALSE);
+    return;
+  }
+
+  AppendLineToMultilineEditCtrl(std::string("Selected device ") + info->name);
+}
+
+// Returns true when `filename` can be opened for reading.
+bool CNonStreamingSpeechRecognitionDlg::Exists(const std::string &filename) {
+  return std::ifstream(filename).good();
+}
+
+// Called when required model files are missing: disables the Start button
+// and appends instructions for downloading and renaming a non-streaming
+// model (one transducer and one paraformer example) to the edit control.
+void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() {
+  my_btn_.EnableWindow(FALSE);
+  std::string msg =
+      "\r\nPlease go to\r\n"
+      "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
+      "\r\n";
+  msg += "to download a non-streaming model, i.e., an offline model.\r\n";
+  msg +=
+      "You need to rename them to encoder.onnx, decoder.onnx, and "
+      "joiner.onnx correspondingly.\r\n\r\n";
+  msg += "It supports both transducer models and paraformer models.\r\n\r\n";
+  msg +=
+      "We give two examples below to show you how to download models\r\n\r\n";
+  msg += "(1) Transducer\r\n\r\n";
+  msg +=
+      "We use "
+      "https://huggingface.co/pkufool/"
+      "icefall-asr-zipformer-wenetspeech-20230615 below\r\n";
+  msg +=
+      "wget "
+      "https://huggingface.co/pkufool/"
+      "icefall-asr-zipformer-wenetspeech-20230615/resolve/main/exp/"
+      "encoder-epoch-12-avg-4.onnx\r\n";
+  msg +=
+      "wget "
+      "https://huggingface.co/pkufool/"
+      "icefall-asr-zipformer-wenetspeech-20230615/resolve/main/exp/"
+      "decoder-epoch-12-avg-4.onnx\r\n";
+  msg +=
+      "wget "
+      "https://huggingface.co/pkufool/"
+      "icefall-asr-zipformer-wenetspeech-20230615/resolve/main/exp/"
+      "joiner-epoch-12-avg-4.onnx\r\n";
+  // InitRecognizer() also requires ./tokens.txt for transducer models.
+  msg += "You also need the tokens.txt file that belongs to the model.\r\n";
+  msg += "\r\n Now rename them\r\n";
+  msg += "mv encoder-epoch-12-avg-4.onnx encoder.onnx\r\n";
+  msg += "mv decoder-epoch-12-avg-4.onnx decoder.onnx\r\n";
+  msg += "mv joiner-epoch-12-avg-4.onnx joiner.onnx\r\n\r\n";
+  msg += "(2) Paraformer\r\n\r\n";
+  msg +=
+      "wget "
+      "https://huggingface.co/csukuangfj/"
+      "sherpa-onnx-paraformer-zh-2023-03-28/resolve/main/model.onnx\r\n";
+  msg +=
+      "wget "
+      "https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/"
+      "resolve/main/tokens.txt\r\n\r\n";
+  msg += "\r\n Now rename them\r\n";
+  msg += "mv model.onnx paraformer.onnx\r\n";
+  msg += "\r\n";
+  msg += "That's it!\r\n";
+
+  AppendLineToMultilineEditCtrl(msg);
+}
+
+// Creates recognizer_ for a paraformer model located in the current
+// directory. ./paraformer.int8.onnx is preferred over ./paraformer.onnx when
+// it exists; ./tokens.txt is required as well.
+//
+// When a required file is missing, the problem and the download help are
+// shown and recognizer_ is left unchanged.
+void CNonStreamingSpeechRecognitionDlg::InitParaformer() {
+  // String literals are used on purpose: config_ is a data member that keeps
+  // these pointers after this function has returned, so they must not point
+  // into local std::string objects.
+  const char *paraformer = "./paraformer.onnx";
+  const char *const tokens = "./tokens.txt";
+
+  bool is_ok = true;
+
+  if (Exists("./paraformer.int8.onnx")) {
+    paraformer = "./paraformer.int8.onnx";
+  } else if (!Exists(paraformer)) {
+    AppendLineToMultilineEditCtrl(std::string(paraformer) +
+                                  " does not exist!");
+    is_ok = false;
+  }
+
+  if (!Exists(tokens)) {
+    AppendLineToMultilineEditCtrl(std::string(tokens) + " does not exist!");
+    is_ok = false;
+  }
+
+  if (!is_ok) {
+    ShowInitRecognizerHelpMessage();
+    return;
+  }
+
+  memset(&config_, 0, sizeof(config_));
+
+  config_.feat_config.sample_rate = 16000;
+  config_.feat_config.feature_dim = 80;
+
+  config_.model_config.paraformer.model = paraformer;
+  config_.model_config.tokens = tokens;
+  config_.model_config.num_threads = 1;
+  config_.model_config.debug = 1;
+
+  config_.decoding_method = "greedy_search";
+  config_.max_active_paths = 4;
+
+  recognizer_ = CreateOfflineRecognizer(&config_);
+}
+
+// Creates recognizer_ from the model files in the current directory.
+//
+// If ./paraformer.onnx or ./paraformer.int8.onnx exists, the work is
+// delegated to InitParaformer(). Otherwise a transducer model is assumed,
+// which needs ./encoder.onnx, ./decoder.onnx, ./joiner.onnx and
+// ./tokens.txt.
+//
+// When a required file is missing, every missing file is reported, the
+// download help is shown and recognizer_ is left unchanged.
+void CNonStreamingSpeechRecognitionDlg::InitRecognizer() {
+  if (Exists("./paraformer.onnx") || Exists("./paraformer.int8.onnx")) {
+    InitParaformer();
+    return;
+  }
+
+  // assume it is transducer
+
+  // String literals are used on purpose: config_ is a data member that keeps
+  // these pointers after this function has returned, so they must not point
+  // into local std::string objects.
+  const char *const encoder = "./encoder.onnx";
+  const char *const decoder = "./decoder.onnx";
+  const char *const joiner = "./joiner.onnx";
+  const char *const tokens = "./tokens.txt";
+
+  bool is_ok = true;
+  for (const char *path : {encoder, decoder, joiner, tokens}) {
+    if (!Exists(path)) {
+      AppendLineToMultilineEditCtrl(std::string(path) + " does not exist!");
+      is_ok = false;
+    }
+  }
+
+  if (!is_ok) {
+    ShowInitRecognizerHelpMessage();
+    return;
+  }
+
+  memset(&config_, 0, sizeof(config_));
+
+  config_.feat_config.sample_rate = 16000;
+  config_.feat_config.feature_dim = 80;
+
+  config_.model_config.transducer.encoder = encoder;
+  config_.model_config.transducer.decoder = decoder;
+  config_.model_config.transducer.joiner = joiner;
+  config_.model_config.tokens = tokens;
+  config_.model_config.num_threads = 1;
+  config_.model_config.debug = 0;
+
+  config_.decoding_method = "greedy_search";
+  config_.max_active_paths = 4;
+
+  recognizer_ = CreateOfflineRecognizer(&config_);
+}
+
+// Appends the UTF-8 string `s` to the end of the edit control: the caret is
+// moved behind the existing text and the (empty) selection there is replaced
+// with the UTF-16 form of `s`.
+void CNonStreamingSpeechRecognitionDlg::AppendTextToEditCtrl(
+    const std::string &s) {
+  const int end = my_text_.GetWindowTextLength();
+  my_text_.SetSel(end, end);
+  my_text_.ReplaceSel(Utf8ToUtf16(s).c_str());
+}
+
+// Appends `s` to the edit control, preceded by a "\r\n" line break.
+void CNonStreamingSpeechRecognitionDlg::AppendLineToMultilineEditCtrl(
+ const std::string &s) {
+ AppendTextToEditCtrl("\r\n" + s);
+}
diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.h b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.h
new file mode 100644
index 00000000..e364bc58
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.h
@@ -0,0 +1,73 @@
+
+// NonStreamingSpeechRecognitionDlg.h : header file
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "portaudio.h"
+#include "sherpa-onnx/c-api/c-api.h"
+
+// Helper whose constructor calls Pa_Initialize() and whose destructor calls
+// Pa_Terminate() (see NonStreamingSpeechRecognitionDlg.cpp).
+class Microphone {
+ public:
+ Microphone();
+ ~Microphone();
+};
+
+// CNonStreamingSpeechRecognitionDlg dialog
+// Main dialog of the example: records audio from the default input device
+// with PortAudio and decodes each recording with a sherpa-onnx offline
+// (non-streaming) recognizer.
+class CNonStreamingSpeechRecognitionDlg : public CDialogEx {
+  // Construction
+ public:
+  CNonStreamingSpeechRecognitionDlg(
+      CWnd *pParent = nullptr);  // standard constructor
+  ~CNonStreamingSpeechRecognitionDlg();
+
+// Dialog Data
+#ifdef AFX_DESIGN_TIME
+  enum { IDD = IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG };
+#endif
+
+ protected:
+  virtual void DoDataExchange(CDataExchange *pDX);  // DDX/DDV support
+
+  // Implementation
+ protected:
+  HICON m_hIcon;
+
+  // Generated message map functions
+  virtual BOOL OnInitDialog();
+  afx_msg void OnPaint();
+  afx_msg HCURSOR OnQueryDragIcon();
+  DECLARE_MESSAGE_MAP()
+ public:
+  // Handler of the Start/Stop button (IDOK).
+  afx_msg void OnBnClickedOk();
+  // NOTE(review): no definition is visible in
+  // NonStreamingSpeechRecognitionDlg.cpp; presumably a leftover - confirm
+  // before calling or removing it.
+  int RunThread();
+
+ private:
+  // Declared first so that it is constructed before and destroyed after all
+  // other members, i.e. PortAudio is initialized for their whole lifetime.
+  Microphone mic_;
+
+  SherpaOnnxOfflineRecognizer *recognizer_ = nullptr;
+  SherpaOnnxOfflineRecognizerConfig config_{};
+
+  PaStream *pa_stream_ = nullptr;
+  CButton my_btn_;   // the Start/Stop button (IDOK)
+  CEdit my_text_;    // the edit control (IDC_EDIT1)
+  std::vector<std::string> results_;  // text of every decoded recording
+
+ public:
+  // The two members below are public because the file-level RecordCallback()
+  // in the .cpp accesses them.
+  bool started_ = false;        // true while a recording is in progress
+  std::vector<float> samples_;  // samples of the current recording
+
+ private:
+  void AppendTextToEditCtrl(const std::string &s);
+  void AppendLineToMultilineEditCtrl(const std::string &s);
+  void InitMicrophone();
+
+  bool Exists(const std::string &filename);
+  void InitRecognizer();
+
+  void InitParaformer();
+  void ShowInitRecognizerHelpMessage();
+};
diff --git a/mfc-examples/NonStreamingSpeechRecognition/Resource.h b/mfc-examples/NonStreamingSpeechRecognition/Resource.h
new file mode 100644
index 00000000..69eeecfa
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/Resource.h
@@ -0,0 +1,18 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by NonStreamingSpeechRecognition.rc
+//
+#define IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG 102
+#define IDR_MAINFRAME 128
+#define IDC_EDIT1 1000
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 130
+#define _APS_NEXT_COMMAND_VALUE 32771
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/mfc-examples/NonStreamingSpeechRecognition/framework.h b/mfc-examples/NonStreamingSpeechRecognition/framework.h
new file mode 100644
index 00000000..65e02c32
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/framework.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#ifndef VC_EXTRALEAN
+#define VC_EXTRALEAN // Exclude rarely-used stuff from Windows headers
+#endif
+
+#include "targetver.h"
+
+#define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS // some CString constructors will be
+ // explicit
+
+// turns off MFC's hiding of some common and often safely ignored warning
+// messages
+#define _AFX_ALL_WARNINGS
+
+#include <afxext.h>  // MFC extensions
+#include <afxwin.h>  // MFC core and standard components
+
+#ifndef _AFX_NO_OLE_SUPPORT
+#include <afxdtctl.h>  // MFC support for Internet Explorer 4 Common Controls
+#endif
+#ifndef _AFX_NO_AFXCMN_SUPPORT
+#include <afxcmn.h>  // MFC support for Windows Common Controls
+#endif               // _AFX_NO_AFXCMN_SUPPORT
+
+#include <afxcontrolbars.h>  // MFC support for ribbons and control bars
diff --git a/mfc-examples/NonStreamingSpeechRecognition/pch.cpp b/mfc-examples/NonStreamingSpeechRecognition/pch.cpp
new file mode 100644
index 00000000..00df68aa
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/pch.cpp
@@ -0,0 +1,6 @@
+// pch.cpp: source file corresponding to the pre-compiled header
+
+#include "pch.h"
+
+// When you are using pre-compiled headers, this source file is necessary for
+// compilation to succeed.
diff --git a/mfc-examples/NonStreamingSpeechRecognition/pch.h b/mfc-examples/NonStreamingSpeechRecognition/pch.h
new file mode 100644
index 00000000..4e7f5afb
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/pch.h
@@ -0,0 +1,15 @@
+// pch.h: This is a precompiled header file.
+// Files listed below are compiled only once, improving build performance for
+// future builds. This also affects IntelliSense performance, including code
+// completion and many code browsing features. However, files listed here are
+// ALL re-compiled if any one of them is updated between builds. Do not add
+// files here that you will be updating frequently as this negates the
+// performance advantage.
+
+#ifndef PCH_H
+#define PCH_H
+
+// add headers that you want to pre-compile here
+#include "framework.h"
+
+#endif // PCH_H
diff --git a/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.ico b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.ico
new file mode 100644
index 00000000..d56fbcdf
Binary files /dev/null and b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.ico differ
diff --git a/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.rc2 b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.rc2
new file mode 100644
index 00000000..629a3829
Binary files /dev/null and b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.rc2 differ
diff --git a/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props b/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props
new file mode 100644
index 00000000..f0e609d3
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props
@@ -0,0 +1,50 @@
+
+
+
+
+
+ ..\..\build
+ ..\..\build\install
+
+ sherpa-onnx-portaudio_static.lib;
+ sherpa-onnx-c-api.lib;
+ sherpa-onnx-core.lib;
+ kaldi-native-fbank-core.lib;
+ absl_base.lib;
+ absl_city.lib;
+ absl_hash.lib;
+ absl_low_level_hash.lib;
+ absl_raw_hash_set.lib;
+ absl_raw_logging_internal.lib;
+ absl_throw_delegate.lib;
+ clog.lib;
+ cpuinfo.lib;
+ flatbuffers.lib;
+ libprotobuf-lite.lib;
+ onnx.lib;
+ onnx_proto.lib;
+ onnxruntime_common.lib;
+ onnxruntime_flatbuffers.lib;
+ onnxruntime_framework.lib;
+ onnxruntime_graph.lib;
+ onnxruntime_mlas.lib;
+ onnxruntime_optimizer.lib;
+ onnxruntime_providers.lib;
+ onnxruntime_session.lib;
+ onnxruntime_util.lib;
+ re2.lib;
+
+
+
+
+
+ $(SherpaOnnxBuildDirectory)\_deps\portaudio-src\include;
+ $(SherpaOnnxInstallDirectory)\include;%(AdditionalIncludeDirectories)
+
+
+ $(SherpaOnnxInstallDirectory)\lib;%(AdditionalLibraryDirectories)
+ $(SherpaOnnxLibraries);
+
+
+
+
diff --git a/mfc-examples/NonStreamingSpeechRecognition/targetver.h b/mfc-examples/NonStreamingSpeechRecognition/targetver.h
new file mode 100644
index 00000000..87fe2aec
--- /dev/null
+++ b/mfc-examples/NonStreamingSpeechRecognition/targetver.h
@@ -0,0 +1,9 @@
+#pragma once
+
+// Including SDKDDKVer.h defines the highest available Windows platform.
+
+// If you wish to build your application for a previous Windows platform,
+// include WinSDKVer.h and set the _WIN32_WINNT macro to the platform you wish
+// to support before including SDKDDKVer.h.
+
+#include <SDKDDKVer.h>
diff --git a/mfc-examples/README.md b/mfc-examples/README.md
index 6d4ada56..8e35cd29 100644
--- a/mfc-examples/README.md
+++ b/mfc-examples/README.md
@@ -3,11 +3,19 @@
This directory contains examples showing how to use Next-gen Kaldi in MFC
for speech recognition.
-Caution: You need to use Windows and install Visual Studio in order to run it.
+Caution: You need to use Windows and install Visual Studio 2022 in order to
+compile it.
+
+Hint: If you don't want to install Visual Studio, see the end of this page
+for how to download a pre-compiled `exe`.
+
We use bash script below to demonstrate how to use it. Please change
the commands accordingly for Windows.
-## Streaming speech recognition
+## How to compile
+
+
+First, we need to compile sherpa-onnx:
```bash
mkdir -p $HOME/open-source
@@ -19,7 +27,6 @@ mkdir build
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=./install ..
cmake --build . --config Release --target install
-
cd ../mfc-examples
msbuild ./mfc-examples.sln /property:Configuration=Release /property:Platform=x64
@@ -27,26 +34,13 @@ msbuild ./mfc-examples.sln /property:Configuration=Release /property:Platform=x6
# now run the program
./x64/Release/StreamingSpeechRecognition.exe
+./x64/Release/NonStreamingSpeechRecognition.exe
```
-Note that we also need to download pre-trained models. Please
-refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html
-for a list of streaming models.
+If you don't want to compile the project yourself, you can download
+a pre-compiled `exe` from https://github.com/k2-fsa/sherpa-onnx/releases
-We use the following model for demonstration.
+For instance, you can use the following addresses:
-```bash
-cd $HOME/open-source/sherpa-onnx/mfc-examples/x64/Release
-wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx
-wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx
-wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx
-wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt
-
-# now rename
-mv encoder-epoch-12-avg-4-chunk-16-left-128.onnx encoder.onnx
-mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx
-mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx
-
-# Now run it!
-./StreamingSpeechRecognition.exe
-```
+ - https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.5.1/sherpa-onnx-streaming-v1.5.1.exe
+ - https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.5.1/sherpa-onnx-non-streaming-v1.5.1.exe
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp
index 3d81cf2f..e151866b 100644
--- a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp
+++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp
@@ -3,12 +3,14 @@
// application.
//
+// clang-format off
#include "pch.h"
#include "framework.h"
+// clang-format on
#include "StreamingSpeechRecognition.h"
-#include "StreamingSpeechRecognitionDlg.h"
+#include "StreamingSpeechRecognitionDlg.h"
#ifdef _DEBUG
#define new DEBUG_NEW
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp
index 7df684ce..1748b985 100644
--- a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp
+++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp
@@ -1,10 +1,11 @@
// StreamingSpeechRecognitionDlg.cpp : implementation file
//
+// clang-format off
#include "pch.h"
#include "framework.h"
#include "afxdialogex.h"
-
+// clang-format on
#include "StreamingSpeechRecognitionDlg.h"
@@ -15,7 +16,6 @@
#include "StreamingSpeechRecognition.h"
-
#ifdef _DEBUG
#define new DEBUG_NEW
#endif
@@ -223,6 +223,7 @@ void CStreamingSpeechRecognitionDlg::InitMicrophone() {
// exit(-1);
AppendLineToMultilineEditCtrl("No default input device found!");
my_btn_.EnableWindow(FALSE);
+ return;
}
AppendLineToMultilineEditCtrl(std::string("Selected device ") +
Pa_GetDeviceInfo(default_device)->name);
@@ -309,7 +310,6 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() {
msg += "\r\n";
msg += "That's it!\r\n";
-
AppendLineToMultilineEditCtrl(msg);
return;
}
@@ -398,8 +398,6 @@ void CStreamingSpeechRecognitionDlg::AppendTextToEditCtrl(
// put the selection at the end of text
my_text_.SetSel(nLength, nLength);
// replace the selection
- CString str;
- str.Format(_T("%s"), s.c_str());
std::wstring wstr = Utf8ToUtf16(s);
diff --git a/mfc-examples/mfc-examples.sln b/mfc-examples/mfc-examples.sln
index 94a6a9ab..807d8a5b 100644
--- a/mfc-examples/mfc-examples.sln
+++ b/mfc-examples/mfc-examples.sln
@@ -1,10 +1,12 @@
Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 16
-VisualStudioVersion = 16.0.32630.194
+# Visual Studio Version 17
+VisualStudioVersion = 17.6.33829.357
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StreamingSpeechRecognition", "StreamingSpeechRecognition\StreamingSpeechRecognition.vcxproj", "{A79C2604-C33D-497C-9770-D34E118B77FE}"
EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NonStreamingSpeechRecognition", "NonStreamingSpeechRecognition\NonStreamingSpeechRecognition.vcxproj", "{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
@@ -21,6 +23,14 @@ Global
{A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x64.Build.0 = Release|x64
{A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x86.ActiveCfg = Release|Win32
{A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x86.Build.0 = Release|Win32
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x64.ActiveCfg = Debug|x64
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x64.Build.0 = Debug|x64
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x86.ActiveCfg = Debug|Win32
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x86.Build.0 = Debug|Win32
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x64.ActiveCfg = Release|x64
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x64.Build.0 = Release|x64
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.ActiveCfg = Release|Win32
+ {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index 36d77e0c..ba5f768f 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -27,36 +27,38 @@ struct SherpaOnnxDisplay {
std::unique_ptr impl;
};
+#define SHERPA_ONNX_OR(x, y) ((x) ? (x) : (y))  // x if it is non-zero/non-null, otherwise the fallback y; note x may be evaluated twice
+
SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
const SherpaOnnxOnlineRecognizerConfig *config) {
sherpa_onnx::OnlineRecognizerConfig recognizer_config;
- recognizer_config.feat_config.sampling_rate = config->feat_config.sample_rate;
- recognizer_config.feat_config.feature_dim = config->feat_config.feature_dim;
+ recognizer_config.feat_config.sampling_rate = SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000);
+ recognizer_config.feat_config.feature_dim = SHERPA_ONNX_OR(config->feat_config.feature_dim, 80);
recognizer_config.model_config.encoder_filename =
- config->model_config.encoder;
+ SHERPA_ONNX_OR(config->model_config.encoder, "");
recognizer_config.model_config.decoder_filename =
- config->model_config.decoder;
- recognizer_config.model_config.joiner_filename = config->model_config.joiner;
- recognizer_config.model_config.tokens = config->model_config.tokens;
- recognizer_config.model_config.num_threads = config->model_config.num_threads;
- recognizer_config.model_config.provider = config->model_config.provider;
- recognizer_config.model_config.debug = config->model_config.debug;
+ SHERPA_ONNX_OR(config->model_config.decoder, "");
+ recognizer_config.model_config.joiner_filename = SHERPA_ONNX_OR(config->model_config.joiner, "");
+ recognizer_config.model_config.tokens = SHERPA_ONNX_OR(config->model_config.tokens, "");
+ recognizer_config.model_config.num_threads = SHERPA_ONNX_OR(config->model_config.num_threads, 1);
+ recognizer_config.model_config.provider = SHERPA_ONNX_OR(config->model_config.provider, "cpu");
+ recognizer_config.model_config.debug = SHERPA_ONNX_OR(config->model_config.debug, 0);
- recognizer_config.decoding_method = config->decoding_method;
- recognizer_config.max_active_paths = config->max_active_paths;
+ recognizer_config.decoding_method = SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
+ recognizer_config.max_active_paths = SHERPA_ONNX_OR(config->max_active_paths, 4);
- recognizer_config.enable_endpoint = config->enable_endpoint;
+ recognizer_config.enable_endpoint = SHERPA_ONNX_OR(config->enable_endpoint, 0);
recognizer_config.endpoint_config.rule1.min_trailing_silence =
- config->rule1_min_trailing_silence;
+ SHERPA_ONNX_OR(config->rule1_min_trailing_silence, 2.4);
recognizer_config.endpoint_config.rule2.min_trailing_silence =
- config->rule2_min_trailing_silence;
+ SHERPA_ONNX_OR(config->rule2_min_trailing_silence, 1.2);
recognizer_config.endpoint_config.rule3.min_utterance_length =
- config->rule3_min_utterance_length;
+ SHERPA_ONNX_OR(config->rule3_min_utterance_length, 20);
if (config->model_config.debug) {
fprintf(stderr, "%s\n", recognizer_config.ToString().c_str());
@@ -171,34 +173,34 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
const SherpaOnnxOfflineRecognizerConfig *config) {
sherpa_onnx::OfflineRecognizerConfig recognizer_config;
- recognizer_config.feat_config.sampling_rate = config->feat_config.sample_rate;
+ recognizer_config.feat_config.sampling_rate = SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000);
- recognizer_config.feat_config.feature_dim = config->feat_config.feature_dim;
+ recognizer_config.feat_config.feature_dim = SHERPA_ONNX_OR(config->feat_config.feature_dim, 80);
recognizer_config.model_config.transducer.encoder_filename =
- config->model_config.transducer.encoder;
+ SHERPA_ONNX_OR(config->model_config.transducer.encoder, "");
recognizer_config.model_config.transducer.decoder_filename =
- config->model_config.transducer.decoder;
+ SHERPA_ONNX_OR(config->model_config.transducer.decoder, "");
recognizer_config.model_config.transducer.joiner_filename =
- config->model_config.transducer.joiner;
+ SHERPA_ONNX_OR(config->model_config.transducer.joiner,"");
recognizer_config.model_config.paraformer.model =
- config->model_config.paraformer.model;
+ SHERPA_ONNX_OR(config->model_config.paraformer.model, "");
recognizer_config.model_config.nemo_ctc.model =
- config->model_config.nemo_ctc.model;
+ SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, "");
- recognizer_config.model_config.tokens = config->model_config.tokens;
- recognizer_config.model_config.num_threads = config->model_config.num_threads;
- recognizer_config.model_config.debug = config->model_config.debug;
+ recognizer_config.model_config.tokens = SHERPA_ONNX_OR(config->model_config.tokens, "");
+ recognizer_config.model_config.num_threads = SHERPA_ONNX_OR(config->model_config.num_threads, 1);
+ recognizer_config.model_config.debug = SHERPA_ONNX_OR(config->model_config.debug, 0);
- recognizer_config.lm_config.model = config->lm_config.model;
- recognizer_config.lm_config.scale = config->lm_config.scale;
+ recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, "");
+ recognizer_config.lm_config.scale = SHERPA_ONNX_OR(config->lm_config.scale, 1.0);
- recognizer_config.decoding_method = config->decoding_method;
- recognizer_config.max_active_paths = config->max_active_paths;
+ recognizer_config.decoding_method = SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
+ recognizer_config.max_active_paths = SHERPA_ONNX_OR(config->max_active_paths, 4);
if (config->model_config.debug) {
fprintf(stderr, "%s\n", recognizer_config.ToString().c_str());