diff --git a/.github/workflows/mfc.yaml b/.github/workflows/mfc.yaml index 44c65da4..cdf5f9c1 100644 --- a/.github/workflows/mfc.yaml +++ b/.github/workflows/mfc.yaml @@ -98,6 +98,7 @@ jobs: cd mfc-examples/$arch/Release cp StreamingSpeechRecognition.exe sherpa-onnx-streaming-${SHERPA_ONNX_VERSION}.exe + cp NonStreamingSpeechRecognition.exe sherpa-onnx-non-streaming-${SHERPA_ONNX_VERSION}.exe ls -lh - name: Upload artifact @@ -106,10 +107,24 @@ jobs: name: streaming-speech-recognition-${{ matrix.arch }} path: ./mfc-examples/${{ matrix.arch }}/Release/StreamingSpeechRecognition.exe - - name: Release pre-compiled binaries and libs for macOS + - name: Upload artifact + uses: actions/upload-artifact@v2 + with: + name: non-streaming-speech-recognition-${{ matrix.arch }} + path: ./mfc-examples/${{ matrix.arch }}/Release/NonStreamingSpeechRecognition.exe + + - name: Release pre-compiled binaries and libs for Windows ${{ matrix.arch }} if: env.RELEASE == 'true' uses: svenstaro/upload-release-action@v2 with: file_glob: true overwrite: true - file: ./mfc-examples/${{ matrix.arch }}/Release/sherpa-onnx*.exe + file: ./mfc-examples/${{ matrix.arch }}/Release/sherpa-onnx-streaming-*.exe + + - name: Release pre-compiled binaries and libs for Windows ${{ matrix.arch }} + if: env.RELEASE == 'true' + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./mfc-examples/${{ matrix.arch }}/Release/sherpa-onnx-non-streaming-*.exe diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 50bedd94..033d2ceb 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -113,7 +113,7 @@ function(download_onnxruntime) set(onnxruntime_URL "https://huggingface.co/csukuangfj/onnxruntime-libs/resolve/main/onnxruntime-win-x86-static-1.15.1.tar.bz2") set(onnxruntime_URL2 "") - set(onnxruntime_HASH "SHA256=a2b33a3e8a1f89cddf303f0a97a5a88f4202579c653cfb29158c8cf7da3734eb") + set(onnxruntime_HASH 
"SHA256=94d9a30976b5c4a5dff7508d00f141835916e5a36315d5f53be9b3edb85148b5") endif() if(SHERPA_ONNX_ENABLE_GPU) @@ -161,7 +161,7 @@ function(download_onnxruntime) set(onnxruntime_URL "https://huggingface.co/csukuangfj/onnxruntime-libs/resolve/main/onnxruntime-win-x64-static-1.15.1.tar.bz2") set(onnxruntime_URL2 "") - set(onnxruntime_HASH "SHA256=f5c19ac1fc6a61c78a231a41df10aede2586665ab397bdc3f007eb8d2c8d4a19") + set(onnxruntime_HASH "SHA256=c809a8510a89b8b37ae7d563c39229db22bac8fbefcbfe5c81a60b367d065b1b") endif() endif() # After downloading, it contains: diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.cpp b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.cpp new file mode 100644 index 00000000..66a85c49 --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.cpp @@ -0,0 +1,86 @@ + +// NonStreamingSpeechRecognition.cpp : Defines the class behaviors for the +// application. +// + +// clang-format off +#include "pch.h" +#include "framework.h" +#include "NonStreamingSpeechRecognitionDlg.h" +#include "NonStreamingSpeechRecognition.h" +// clang-format on + +#ifdef _DEBUG +#define new DEBUG_NEW +#endif + +// CNonStreamingSpeechRecognitionApp + +BEGIN_MESSAGE_MAP(CNonStreamingSpeechRecognitionApp, CWinApp) +ON_COMMAND(ID_HELP, &CWinApp::OnHelp) +END_MESSAGE_MAP() + +// CNonStreamingSpeechRecognitionApp construction + +CNonStreamingSpeechRecognitionApp::CNonStreamingSpeechRecognitionApp() { + // TODO: add construction code here, + // Place all significant initialization in InitInstance +} + +// The one and only CNonStreamingSpeechRecognitionApp object + +CNonStreamingSpeechRecognitionApp theApp; + +// CNonStreamingSpeechRecognitionApp initialization + +BOOL CNonStreamingSpeechRecognitionApp::InitInstance() { + CWinApp::InitInstance(); + + // Create the shell manager, in case the dialog contains + // any shell tree view or shell list view controls. 
+ CShellManager *pShellManager = new CShellManager; + + // Activate "Windows Native" visual manager for enabling themes in MFC + // controls + CMFCVisualManager::SetDefaultManager(RUNTIME_CLASS(CMFCVisualManagerWindows)); + + // Standard initialization + // If you are not using these features and wish to reduce the size + // of your final executable, you should remove from the following + // the specific initialization routines you do not need + // Change the registry key under which our settings are stored + // TODO: You should modify this string to be something appropriate + // such as the name of your company or organization + SetRegistryKey(_T("Local AppWizard-Generated Applications")); + + CNonStreamingSpeechRecognitionDlg dlg; + m_pMainWnd = &dlg; + INT_PTR nResponse = dlg.DoModal(); + if (nResponse == IDOK) { + // TODO: Place code here to handle when the dialog is + // dismissed with OK + } else if (nResponse == IDCANCEL) { + // TODO: Place code here to handle when the dialog is + // dismissed with Cancel + } else if (nResponse == -1) { + TRACE(traceAppMsg, 0, + "Warning: dialog creation failed, so application is terminating " + "unexpectedly.\n"); + TRACE(traceAppMsg, 0, + "Warning: if you are using MFC controls on the dialog, you cannot " + "#define _AFX_NO_MFC_CONTROLS_IN_DIALOGS.\n"); + } + + // Delete the shell manager created above. + if (pShellManager != nullptr) { + delete pShellManager; + } + +#if !defined(_AFXDLL) && !defined(_AFX_NO_MFC_CONTROLS_IN_DIALOGS) + ControlBarCleanUp(); +#endif + + // Since the dialog has been closed, return FALSE so that we exit the + // application, rather than start the application's message pump. 
+ return FALSE; +} diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.h b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.h new file mode 100644 index 00000000..bb176491 --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.h @@ -0,0 +1,31 @@ + +// NonStreamingSpeechRecognition.h : main header file for the PROJECT_NAME +// application +// + +#pragma once + +#ifndef __AFXWIN_H__ +#error "include 'pch.h' before including this file for PCH" +#endif + +#include "resource.h" // main symbols + +// CNonStreamingSpeechRecognitionApp: +// See NonStreamingSpeechRecognition.cpp for the implementation of this class +// + +class CNonStreamingSpeechRecognitionApp : public CWinApp { + public: + CNonStreamingSpeechRecognitionApp(); + + // Overrides + public: + virtual BOOL InitInstance(); + + // Implementation + + DECLARE_MESSAGE_MAP() +}; + +extern CNonStreamingSpeechRecognitionApp theApp; diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.rc b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.rc new file mode 100644 index 00000000..7730ef76 Binary files /dev/null and b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.rc differ diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj new file mode 100644 index 00000000..b831c06f --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj @@ -0,0 +1,219 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D} + MFCProj + NonStreamingSpeechRecognition + 10.0 + + + + Application + true + v143 + Unicode + Static + + + Application + false + v143 + true + Unicode + Static + + + Application + true + v143 + Unicode + 
Static + + + Application + false + v143 + true + Unicode + Static + + + + + + + + + + + + + + + + + + + + + + + + + false + + + true + + + true + + + false + + + + Use + Level3 + true + true + true + _WINDOWS;NDEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + true + true + + + false + true + NDEBUG;%(PreprocessorDefinitions) + + + 0x0409 + NDEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + WIN32;_WINDOWS;_DEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + + + false + true + _DEBUG;%(PreprocessorDefinitions) + + + 0x0409 + _DEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + _WINDOWS;_DEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + + + false + true + _DEBUG;%(PreprocessorDefinitions) + + + 0x0409 + _DEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + true + true + WIN32;_WINDOWS;NDEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + true + true + + + false + true + NDEBUG;%(PreprocessorDefinitions) + + + 0x0409 + NDEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + + + + + + + + + + + Create + Create + Create + Create + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj.filters b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj.filters new file mode 100644 index 00000000..32434b5a --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognition.vcxproj.filters @@ -0,0 +1,63 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + 
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + + + Resource Files + + + + + Resource Files + + + + + Resource Files + + + \ No newline at end of file diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp new file mode 100644 index 00000000..52b699c0 --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.cpp @@ -0,0 +1,473 @@ + +// NonStreamingSpeechRecognitionDlg.cpp : implementation file +// + +// clang-format off +#include "pch.h" +#include "framework.h" +#include "afxdialogex.h" +#include "NonStreamingSpeechRecognition.h" +#include "NonStreamingSpeechRecognitionDlg.h" +// clang-format on + +#include +#include +#include +#include + +#ifdef _DEBUG +#define new DEBUG_NEW +#endif + +Microphone::Microphone() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(-2); + } +} + +Microphone::~Microphone() { + PaError err = Pa_Terminate(); + if (err != paNoError) { + fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err)); + exit(-2); + } +} + +// see +// https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring +static std::wstring Utf8ToUtf16(const std::string &utf8) { + std::vector unicode; + size_t i = 0; + while (i < utf8.size()) { + unsigned long uni; + size_t todo; + bool error = false; + unsigned char ch = utf8[i++]; + if (ch <= 0x7F) { + uni = ch; + todo = 0; + } else if (ch <= 0xBF) { + throw std::logic_error("not a UTF-8 string"); + } else if (ch <= 0xDF) { + uni = ch & 0x1F; + todo = 1; + } else if (ch <= 0xEF) { + uni = ch & 0x0F; + todo = 2; + } else if 
(ch <= 0xF7) { + uni = ch & 0x07; + todo = 3; + } else { + throw std::logic_error("not a UTF-8 string"); + } + for (size_t j = 0; j < todo; ++j) { + if (i == utf8.size()) throw std::logic_error("not a UTF-8 string"); + unsigned char ch = utf8[i++]; + if (ch < 0x80 || ch > 0xBF) throw std::logic_error("not a UTF-8 string"); + uni <<= 6; + uni += ch & 0x3F; + } + if (uni >= 0xD800 && uni <= 0xDFFF) + throw std::logic_error("not a UTF-8 string"); + if (uni > 0x10FFFF) throw std::logic_error("not a UTF-8 string"); + unicode.push_back(uni); + } + std::wstring utf16; + for (size_t i = 0; i < unicode.size(); ++i) { + unsigned long uni = unicode[i]; + if (uni <= 0xFFFF) { + utf16 += (wchar_t)uni; + } else { + uni -= 0x10000; + utf16 += (wchar_t)((uni >> 10) + 0xD800); + utf16 += (wchar_t)((uni & 0x3FF) + 0xDC00); + } + } + return utf16; +} + +static std::string Cat(const std::vector &results) { + std::ostringstream os; + std::string sep; + + int i = 0; + for (i = 0; i != results.size(); ++i) { + os << sep << i << ": " << results[i]; + sep = "\r\n"; + } + + return os.str(); +} + +// CNonStreamingSpeechRecognitionDlg dialog + +CNonStreamingSpeechRecognitionDlg::CNonStreamingSpeechRecognitionDlg( + CWnd *pParent /*=nullptr*/) + : CDialogEx(IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG, pParent) { + m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME); +} + +CNonStreamingSpeechRecognitionDlg::~CNonStreamingSpeechRecognitionDlg() { + if (recognizer_) { + DestroyOfflineRecognizer(recognizer_); + recognizer_ = nullptr; + } +} + +void CNonStreamingSpeechRecognitionDlg::DoDataExchange(CDataExchange *pDX) { + CDialogEx::DoDataExchange(pDX); + DDX_Control(pDX, IDC_EDIT1, my_text_); + DDX_Control(pDX, IDOK, my_btn_); +} + +BEGIN_MESSAGE_MAP(CNonStreamingSpeechRecognitionDlg, CDialogEx) +ON_WM_PAINT() +ON_WM_QUERYDRAGICON() +ON_BN_CLICKED(IDOK, &CNonStreamingSpeechRecognitionDlg::OnBnClickedOk) +END_MESSAGE_MAP() + +// CNonStreamingSpeechRecognitionDlg message handlers + +BOOL 
CNonStreamingSpeechRecognitionDlg::OnInitDialog() { + CDialogEx::OnInitDialog(); + + // Set the icon for this dialog. The framework does this automatically + // when the application's main window is not a dialog + SetIcon(m_hIcon, TRUE); // Set big icon + SetIcon(m_hIcon, FALSE); // Set small icon + + // TODO: Add extra initialization here + InitMicrophone(); + + return TRUE; // return TRUE unless you set the focus to a control +} + +// If you add a minimize button to your dialog, you will need the code below +// to draw the icon. For MFC applications using the document/view model, +// this is automatically done for you by the framework. + +void CNonStreamingSpeechRecognitionDlg::OnPaint() { + if (IsIconic()) { + CPaintDC dc(this); // device context for painting + + SendMessage(WM_ICONERASEBKGND, reinterpret_cast(dc.GetSafeHdc()), + 0); + + // Center icon in client rectangle + int cxIcon = GetSystemMetrics(SM_CXICON); + int cyIcon = GetSystemMetrics(SM_CYICON); + CRect rect; + GetClientRect(&rect); + int x = (rect.Width() - cxIcon + 1) / 2; + int y = (rect.Height() - cyIcon + 1) / 2; + + // Draw the icon + dc.DrawIcon(x, y, m_hIcon); + } else { + CDialogEx::OnPaint(); + } +} + +// The system calls this function to obtain the cursor to display while the user +// drags +// the minimized window. +HCURSOR CNonStreamingSpeechRecognitionDlg::OnQueryDragIcon() { + return static_cast(m_hIcon); +} + +static int32_t RecordCallback(const void *input_buffer, + void * /*output_buffer*/, + unsigned long frames_per_buffer, // NOLINT + const PaStreamCallbackTimeInfo * /*time_info*/, + PaStreamCallbackFlags /*status_flags*/, + void *user_data) { + auto dlg = reinterpret_cast(user_data); + auto begin = reinterpret_cast(input_buffer); + auto end = begin + frames_per_buffer; + dlg->samples_.insert(dlg->samples_.end(), begin, end); + + return dlg->started_ ? 
paContinue : paComplete; +} + +void CNonStreamingSpeechRecognitionDlg::OnBnClickedOk() { + if (!recognizer_) { + AppendLineToMultilineEditCtrl("Creating recognizer..."); + AppendLineToMultilineEditCtrl("It will take several seconds. Please wait"); + InitRecognizer(); + if (!recognizer_) { + // failed to create the recognizer + return; + } + AppendLineToMultilineEditCtrl("Recognizer created!"); + } + + if (!started_) { + samples_.clear(); + started_ = true; + + PaStreamParameters param; + param.device = Pa_GetDefaultInputDevice(); + const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); + param.channelCount = 1; + param.sampleFormat = paFloat32; + param.suggestedLatency = info->defaultLowInputLatency; + param.hostApiSpecificStreamInfo = nullptr; + float sample_rate = config_.feat_config.sample_rate; + pa_stream_ = nullptr; + PaError err = + Pa_OpenStream(&pa_stream_, ¶m, nullptr, /* &outputParameters, */ + sample_rate, + 0, // frames per buffer + paClipOff, // we won't output out of range samples + // so don't bother clipping them + RecordCallback, this); + if (err != paNoError) { + AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") + + Pa_GetErrorText(err)); + my_btn_.EnableWindow(FALSE); + return; + } + + err = Pa_StartStream(pa_stream_); + if (err != paNoError) { + AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") + + Pa_GetErrorText(err)); + my_btn_.EnableWindow(FALSE); + return; + } + AppendLineToMultilineEditCtrl( + "\r\nStarted! 
Please speak and click stop.\r\n"); + my_btn_.SetWindowText(_T("Stop")); + + } else { + started_ = false; + + Pa_Sleep(200); // sleep for 200ms + if (pa_stream_) { + PaError err = Pa_CloseStream(pa_stream_); + if (err != paNoError) { + AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") + + Pa_GetErrorText(err)); + my_btn_.EnableWindow(FALSE); + return; + } + } + pa_stream_ = nullptr; + + SherpaOnnxOfflineStream *stream = CreateOfflineStream(recognizer_); + + AcceptWaveformOffline(stream, config_.feat_config.sample_rate, + samples_.data(), samples_.size()); + DecodeOfflineStream(recognizer_, stream); + SherpaOnnxOfflineRecognizerResult *r = GetOfflineStreamResult(stream); + results_.emplace_back(r->text); + + auto str = Utf8ToUtf16(Cat(results_).c_str()); + my_text_.SetWindowText(str.c_str()); + my_text_.SetFocus(); + my_text_.SetSel(-1); + + DestroyOfflineRecognizerResult(r); + + DestroyOfflineStream(stream); + // AfxMessageBox("Stopped", MB_OK); + my_btn_.SetWindowText(_T("Start")); + AppendLineToMultilineEditCtrl("\r\nStopped. 
Please click start and speak"); + } +} + +void CNonStreamingSpeechRecognitionDlg::InitMicrophone() { + int default_device = Pa_GetDefaultInputDevice(); + int device_count = Pa_GetDeviceCount(); + if (default_device == paNoDevice) { + // CString str; + // str.Format(_T("No default input device found!")); + // AfxMessageBox(str, MB_OK | MB_ICONSTOP); + // exit(-1); + AppendLineToMultilineEditCtrl("No default input device found!"); + my_btn_.EnableWindow(FALSE); + return; + } + AppendLineToMultilineEditCtrl(std::string("Selected device ") + + Pa_GetDeviceInfo(default_device)->name); +} + +bool CNonStreamingSpeechRecognitionDlg::Exists(const std::string &filename) { + std::ifstream is(filename); + return is.good(); +} + +void CNonStreamingSpeechRecognitionDlg::ShowInitRecognizerHelpMessage() { + my_btn_.EnableWindow(FALSE); + std::string msg = + "\r\nPlease go to\r\n" + "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html " + "\r\n"; + msg += "to download a non-streaming model, i.e., an offline model.\r\n"; + msg += + "You need to rename them to encoder.onnx, decoder.onnx, and " + "joiner.onnx correspoondingly.\r\n\r\n"; + msg += "It supports both transducer models and paraformer models.\r\n\r\n"; + msg += + "We give two examples below to show you how to download models\r\n\r\n"; + msg += "(1) Transducer\r\n\r\n"; + msg += + "We use " + "https://huggingface.co/pkufool/" + "icefall-asr-zipformer-wenetspeech-20230615 below\r\n"; + msg += + "wget " + "https://huggingface.co/pkufool/" + "icefall-asr-zipformer-wenetspeech-20230615/resolve/main/exp/" + "encoder-epoch-12-avg-4.onnx\r\n"; + msg += + "wget " + "https://huggingface.co/pkufool/" + "icefall-asr-zipformer-wenetspeech-20230615/resolve/main/exp/" + "decoder-epoch-12-avg-4.onnx\r\n"; + msg += + "wget " + "https://huggingface.co/pkufool/" + "icefall-asr-zipformer-wenetspeech-20230615/resolve/main/exp/" + "joiner-epoch-12-avg-4.onnx\r\n"; + msg += "\r\n Now rename them\r\n"; + msg += "mv 
encoder-epoch-12-avg-4.onnx encoder.onnx\r\n"; + msg += "mv decoder-epoch-12-avg-4.onnx decoder.onnx\r\n"; + msg += "mv joiner-epoch-12-avg-4.onnx joiner.onnx\r\n\r\n"; + msg += "(2) Paraformer\r\n\r\n"; + msg += + "wget " + "https://huggingface.co/csukuangfj/" + "sherpa-onnx-paraformer-zh-2023-03-28/resolve/main/model.onnx\r\n"; + msg += + "wget " + "https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28/" + "resolve/main/tokens.txt\r\n\r\n"; + msg += "\r\n Now rename them\r\n"; + msg += "mv model.onnx paraformer.onnx\r\n"; + msg += "\r\n"; + msg += "That's it!\r\n"; + + AppendLineToMultilineEditCtrl(msg); +} + +void CNonStreamingSpeechRecognitionDlg::InitParaformer() { + std::string paraformer = "./paraformer.onnx"; + std::string tokens = "./tokens.txt"; + + bool is_ok = true; + + if (Exists("./paraformer.int8.onnx")) { + paraformer = "./paraformer.int8.onnx"; + } else if (!Exists(paraformer)) { + std::string msg = paraformer + " does not exist!"; + AppendLineToMultilineEditCtrl(msg); + is_ok = false; + } + + if (!Exists(tokens)) { + std::string msg = tokens + " does not exist!"; + AppendLineToMultilineEditCtrl(msg); + is_ok = false; + } + + if (!is_ok) { + ShowInitRecognizerHelpMessage(); + return; + } + + memset(&config_, 0, sizeof(config_)); + + config_.feat_config.sample_rate = 16000; + config_.feat_config.feature_dim = 80; + + config_.model_config.paraformer.model = paraformer.c_str(); + config_.model_config.tokens = tokens.c_str(); + config_.model_config.num_threads = 1; + config_.model_config.debug = 1; + + config_.decoding_method = "greedy_search"; + config_.max_active_paths = 4; + + recognizer_ = CreateOfflineRecognizer(&config_); +} + +void CNonStreamingSpeechRecognitionDlg::InitRecognizer() { + if (Exists("./paraformer.onnx") || Exists("./paraformer.int8.onnx")) { + InitParaformer(); + return; + } + + // assume it is transducer + + std::string encoder = "./encoder.onnx"; + std::string decoder = "./decoder.onnx"; + std::string joiner = 
"./joiner.onnx"; + std::string tokens = "./tokens.txt"; + + bool is_ok = true; + if (!Exists(encoder)) { + std::string msg = encoder + " does not exist!"; + AppendLineToMultilineEditCtrl(msg); + is_ok = false; + } + + if (!Exists(decoder)) { + std::string msg = decoder + " does not exist!"; + AppendLineToMultilineEditCtrl(msg); + is_ok = false; + } + + if (!Exists(joiner)) { + std::string msg = joiner + " does not exist!"; + AppendLineToMultilineEditCtrl(msg); + is_ok = false; + } + + if (!Exists(tokens)) { + std::string msg = tokens + " does not exist!"; + AppendLineToMultilineEditCtrl(msg); + is_ok = false; + } + + if (!is_ok) { + ShowInitRecognizerHelpMessage(); + return; + } + memset(&config_, 0, sizeof(config_)); + + config_.feat_config.sample_rate = 16000; + config_.feat_config.feature_dim = 80; + + config_.model_config.transducer.encoder = encoder.c_str(); + config_.model_config.transducer.decoder = decoder.c_str(); + config_.model_config.transducer.joiner = joiner.c_str(); + config_.model_config.tokens = tokens.c_str(); + config_.model_config.num_threads = 1; + config_.model_config.debug = 0; + + config_.decoding_method = "greedy_search"; + config_.max_active_paths = 4; + + recognizer_ = CreateOfflineRecognizer(&config_); +} + +void CNonStreamingSpeechRecognitionDlg::AppendTextToEditCtrl( + const std::string &s) { + // get the initial text length + int nLength = my_text_.GetWindowTextLength(); + // put the selection at the end of text + my_text_.SetSel(nLength, nLength); + // replace the selection + + std::wstring wstr = Utf8ToUtf16(s); + + my_text_.ReplaceSel(wstr.c_str()); +} + +void CNonStreamingSpeechRecognitionDlg::AppendLineToMultilineEditCtrl( + const std::string &s) { + AppendTextToEditCtrl("\r\n" + s); +} diff --git a/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.h b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.h new file mode 100644 index 00000000..e364bc58 --- /dev/null +++ 
b/mfc-examples/NonStreamingSpeechRecognition/NonStreamingSpeechRecognitionDlg.h @@ -0,0 +1,73 @@ + +// NonStreamingSpeechRecognitionDlg.h : header file +// + +#pragma once + +#include +#include + +#include "portaudio.h" +#include "sherpa-onnx/c-api/c-api.h" + +class Microphone { + public: + Microphone(); + ~Microphone(); +}; + +// CNonStreamingSpeechRecognitionDlg dialog +class CNonStreamingSpeechRecognitionDlg : public CDialogEx { + // Construction + public: + CNonStreamingSpeechRecognitionDlg( + CWnd *pParent = nullptr); // standard constructor + ~CNonStreamingSpeechRecognitionDlg(); + +// Dialog Data +#ifdef AFX_DESIGN_TIME + enum { IDD = IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG }; +#endif + + protected: + virtual void DoDataExchange(CDataExchange *pDX); // DDX/DDV support + + // Implementation + protected: + HICON m_hIcon; + + // Generated message map functions + virtual BOOL OnInitDialog(); + afx_msg void OnPaint(); + afx_msg HCURSOR OnQueryDragIcon(); + DECLARE_MESSAGE_MAP() + public: + afx_msg void OnBnClickedOk(); + int RunThread(); + + private: + Microphone mic_; + + SherpaOnnxOfflineRecognizer *recognizer_ = nullptr; + SherpaOnnxOfflineRecognizerConfig config_; + + PaStream *pa_stream_ = nullptr; + CButton my_btn_; + CEdit my_text_; + std::vector results_; + + public: + bool started_ = false; + std::vector samples_; + + private: + void AppendTextToEditCtrl(const std::string &s); + void AppendLineToMultilineEditCtrl(const std::string &s); + void InitMicrophone(); + + bool Exists(const std::string &filename); + void InitRecognizer(); + + void InitParaformer(); + void ShowInitRecognizerHelpMessage(); +}; diff --git a/mfc-examples/NonStreamingSpeechRecognition/Resource.h b/mfc-examples/NonStreamingSpeechRecognition/Resource.h new file mode 100644 index 00000000..69eeecfa --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/Resource.h @@ -0,0 +1,18 @@ +//{{NO_DEPENDENCIES}} +// Microsoft Visual C++ generated include file. 
+// Used by NonStreamingSpeechRecognition.rc +// +#define IDD_NONSTREAMINGSPEECHRECOGNITION_DIALOG 102 +#define IDR_MAINFRAME 128 +#define IDC_EDIT1 1000 + +// Next default values for new objects +// +#ifdef APSTUDIO_INVOKED +#ifndef APSTUDIO_READONLY_SYMBOLS +#define _APS_NEXT_RESOURCE_VALUE 130 +#define _APS_NEXT_COMMAND_VALUE 32771 +#define _APS_NEXT_CONTROL_VALUE 1001 +#define _APS_NEXT_SYMED_VALUE 101 +#endif +#endif diff --git a/mfc-examples/NonStreamingSpeechRecognition/framework.h b/mfc-examples/NonStreamingSpeechRecognition/framework.h new file mode 100644 index 00000000..65e02c32 --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/framework.h @@ -0,0 +1,26 @@ +#pragma once + +#ifndef VC_EXTRALEAN +#define VC_EXTRALEAN // Exclude rarely-used stuff from Windows headers +#endif + +#include "targetver.h" + +#define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS // some CString constructors will be + // explicit + +// turns off MFC's hiding of some common and often safely ignored warning +// messages +#define _AFX_ALL_WARNINGS + +#include // MFC extensions +#include // MFC core and standard components + +#ifndef _AFX_NO_OLE_SUPPORT +#include // MFC support for Internet Explorer 4 Common Controls +#endif +#ifndef _AFX_NO_AFXCMN_SUPPORT +#include // MFC support for Windows Common Controls +#endif // _AFX_NO_AFXCMN_SUPPORT + +#include // MFC support for ribbons and control bars diff --git a/mfc-examples/NonStreamingSpeechRecognition/pch.cpp b/mfc-examples/NonStreamingSpeechRecognition/pch.cpp new file mode 100644 index 00000000..00df68aa --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/pch.cpp @@ -0,0 +1,6 @@ +// pch.cpp: source file corresponding to the pre-compiled header + +#include "pch.h" + +// When you are using pre-compiled headers, this source file is necessary for +// compilation to succeed. 
diff --git a/mfc-examples/NonStreamingSpeechRecognition/pch.h b/mfc-examples/NonStreamingSpeechRecognition/pch.h new file mode 100644 index 00000000..4e7f5afb --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/pch.h @@ -0,0 +1,15 @@ +// pch.h: This is a precompiled header file. +// Files listed below are compiled only once, improving build performance for +// future builds. This also affects IntelliSense performance, including code +// completion and many code browsing features. However, files listed here are +// ALL re-compiled if any one of them is updated between builds. Do not add +// files here that you will be updating frequently as this negates the +// performance advantage. + +#ifndef PCH_H +#define PCH_H + +// add headers that you want to pre-compile here +#include "framework.h" + +#endif // PCH_H diff --git a/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.ico b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.ico new file mode 100644 index 00000000..d56fbcdf Binary files /dev/null and b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.ico differ diff --git a/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.rc2 b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.rc2 new file mode 100644 index 00000000..629a3829 Binary files /dev/null and b/mfc-examples/NonStreamingSpeechRecognition/res/NonStreamingSpeechRecognition.rc2 differ diff --git a/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props b/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props new file mode 100644 index 00000000..f0e609d3 --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/sherpa-onnx-deps.props @@ -0,0 +1,50 @@ + + + + + + ..\..\build + ..\..\build\install + + sherpa-onnx-portaudio_static.lib; + sherpa-onnx-c-api.lib; + sherpa-onnx-core.lib; + kaldi-native-fbank-core.lib; + absl_base.lib; + 
absl_city.lib; + absl_hash.lib; + absl_low_level_hash.lib; + absl_raw_hash_set.lib; + absl_raw_logging_internal.lib; + absl_throw_delegate.lib; + clog.lib; + cpuinfo.lib; + flatbuffers.lib; + libprotobuf-lite.lib; + onnx.lib; + onnx_proto.lib; + onnxruntime_common.lib; + onnxruntime_flatbuffers.lib; + onnxruntime_framework.lib; + onnxruntime_graph.lib; + onnxruntime_mlas.lib; + onnxruntime_optimizer.lib; + onnxruntime_providers.lib; + onnxruntime_session.lib; + onnxruntime_util.lib; + re2.lib; + + + + + + $(SherpaOnnxBuildDirectory)\_deps\portaudio-src\include; + $(SherpaOnnxInstallDirectory)\include;%(AdditionalIncludeDirectories) + + + $(SherpaOnnxInstallDirectory)\lib;%(AdditionalLibraryDirectories) + $(SherpaOnnxLibraries); + + + + diff --git a/mfc-examples/NonStreamingSpeechRecognition/targetver.h b/mfc-examples/NonStreamingSpeechRecognition/targetver.h new file mode 100644 index 00000000..87fe2aec --- /dev/null +++ b/mfc-examples/NonStreamingSpeechRecognition/targetver.h @@ -0,0 +1,9 @@ +#pragma once + +// Including SDKDDKVer.h defines the highest available Windows platform. + +// If you wish to build your application for a previous Windows platform, +// include WinSDKVer.h and set the _WIN32_WINNT macro to the platform you wish +// to support before including SDKDDKVer.h. + +#include diff --git a/mfc-examples/README.md b/mfc-examples/README.md index 6d4ada56..8e35cd29 100644 --- a/mfc-examples/README.md +++ b/mfc-examples/README.md @@ -3,11 +3,19 @@ This directory contains examples showing how to use Next-gen Kaldi in MFC for speech recognition. -Caution: You need to use Windows and install Visual Studio in order to run it. +Caution: You need to use Windows and install Visual Studio 2022 in order to +compile it. + +Hint: If you don't want to install Visual Studio, see below for how to +download a pre-compiled `exe`. + We use bash script below to demonstrate how to use it. Please change the commands accordingly for Windows. 
-## Streaming speech recognition +## How to compile + + +First, we need to compile sherpa-onnx: ```bash mkdir -p $HOME/open-source @@ -19,7 +27,6 @@ mkdir build cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=./install .. cmake --build . --config Release --target install - cd ../mfc-examples msbuild ./mfc-examples.sln /property:Configuration=Release /property:Platform=x64 @@ -27,26 +34,13 @@ msbuild ./mfc-examples.sln /property:Configuration=Release /property:Platform=x6 # now run the program ./x64/Release/StreamingSpeechRecognition.exe +./x64/Release/NonStreamingSpeechRecognition.exe ``` -Note that we also need to download pre-trained models. Please -refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html -for a list of streaming models. +If you don't want to compile the project by yourself, you can download +pre-compiled `exe` from https://github.com/k2-fsa/sherpa-onnx/releases -We use the following model for demonstration. +For instance, you can use the following addresses: -```bash -cd $HOME/open-source/sherpa-onnx/mfc-examples/x64/Release -wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx -wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx -wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx -wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt - -# now rename -mv encoder-epoch-12-avg-4-chunk-16-left-128.onnx encoder.onnx -mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx -mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx - -# Now run it! 
-./StreamingSpeechRecognition.exe -``` + - https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.5.1/sherpa-onnx-streaming-v1.5.1.exe + - https://github.com/k2-fsa/sherpa-onnx/releases/download/v1.5.1/sherpa-onnx-non-streaming-v1.5.1.exe diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp index 3d81cf2f..e151866b 100644 --- a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp +++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp @@ -3,12 +3,14 @@ // application. // +// clang-format off #include "pch.h" #include "framework.h" +// clang-format on #include "StreamingSpeechRecognition.h" -#include "StreamingSpeechRecognitionDlg.h" +#include "StreamingSpeechRecognitionDlg.h" #ifdef _DEBUG #define new DEBUG_NEW diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp index 7df684ce..1748b985 100644 --- a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp +++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp @@ -1,10 +1,11 @@ // StreamingSpeechRecognitionDlg.cpp : implementation file // +// clang-format off #include "pch.h" #include "framework.h" #include "afxdialogex.h" - +// clang-format on #include "StreamingSpeechRecognitionDlg.h" @@ -15,7 +16,6 @@ #include "StreamingSpeechRecognition.h" - #ifdef _DEBUG #define new DEBUG_NEW #endif @@ -223,6 +223,7 @@ void CStreamingSpeechRecognitionDlg::InitMicrophone() { // exit(-1); AppendLineToMultilineEditCtrl("No default input device found!"); my_btn_.EnableWindow(FALSE); + return; } AppendLineToMultilineEditCtrl(std::string("Selected device ") + Pa_GetDeviceInfo(default_device)->name); @@ -309,7 +310,6 @@ void CStreamingSpeechRecognitionDlg::InitRecognizer() { msg += "\r\n"; msg += "That's it!\r\n"; - 
AppendLineToMultilineEditCtrl(msg); return; } @@ -398,8 +398,6 @@ void CStreamingSpeechRecognitionDlg::AppendTextToEditCtrl( // put the selection at the end of text my_text_.SetSel(nLength, nLength); // replace the selection - CString str; - str.Format(_T("%s"), s.c_str()); std::wstring wstr = Utf8ToUtf16(s); diff --git a/mfc-examples/mfc-examples.sln b/mfc-examples/mfc-examples.sln index 94a6a9ab..807d8a5b 100644 --- a/mfc-examples/mfc-examples.sln +++ b/mfc-examples/mfc-examples.sln @@ -1,10 +1,12 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.32630.194 +# Visual Studio Version 17 +VisualStudioVersion = 17.6.33829.357 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StreamingSpeechRecognition", "StreamingSpeechRecognition\StreamingSpeechRecognition.vcxproj", "{A79C2604-C33D-497C-9770-D34E118B77FE}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NonStreamingSpeechRecognition", "NonStreamingSpeechRecognition\NonStreamingSpeechRecognition.vcxproj", "{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -21,6 +23,14 @@ Global {A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x64.Build.0 = Release|x64 {A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x86.ActiveCfg = Release|Win32 {A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x86.Build.0 = Release|Win32 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x64.ActiveCfg = Debug|x64 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x64.Build.0 = Debug|x64 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x86.ActiveCfg = Debug|Win32 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Debug|x86.Build.0 = Debug|Win32 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x64.ActiveCfg = Release|x64 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x64.Build.0 = Release|x64 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.ActiveCfg 
= Release|Win32 + {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 36d77e0c..ba5f768f 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -27,36 +27,38 @@ struct SherpaOnnxDisplay { std::unique_ptr impl; }; +#define SHERPA_ONNX_OR(x, y) (x ? x : y) + SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( const SherpaOnnxOnlineRecognizerConfig *config) { sherpa_onnx::OnlineRecognizerConfig recognizer_config; - recognizer_config.feat_config.sampling_rate = config->feat_config.sample_rate; - recognizer_config.feat_config.feature_dim = config->feat_config.feature_dim; + recognizer_config.feat_config.sampling_rate = SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000); + recognizer_config.feat_config.feature_dim = SHERPA_ONNX_OR(config->feat_config.feature_dim, 80); recognizer_config.model_config.encoder_filename = - config->model_config.encoder; + SHERPA_ONNX_OR(config->model_config.encoder, ""); recognizer_config.model_config.decoder_filename = - config->model_config.decoder; - recognizer_config.model_config.joiner_filename = config->model_config.joiner; - recognizer_config.model_config.tokens = config->model_config.tokens; - recognizer_config.model_config.num_threads = config->model_config.num_threads; - recognizer_config.model_config.provider = config->model_config.provider; - recognizer_config.model_config.debug = config->model_config.debug; + SHERPA_ONNX_OR(config->model_config.decoder, ""); + recognizer_config.model_config.joiner_filename = SHERPA_ONNX_OR(config->model_config.joiner, ""); + recognizer_config.model_config.tokens = SHERPA_ONNX_OR(config->model_config.tokens, ""); + recognizer_config.model_config.num_threads = SHERPA_ONNX_OR(config->model_config.num_threads, 1); + recognizer_config.model_config.provider = 
SHERPA_ONNX_OR(config->model_config.provider, "cpu"); + recognizer_config.model_config.debug = SHERPA_ONNX_OR(config->model_config.debug, 0); - recognizer_config.decoding_method = config->decoding_method; - recognizer_config.max_active_paths = config->max_active_paths; + recognizer_config.decoding_method = SHERPA_ONNX_OR(config->decoding_method, "greedy_search"); + recognizer_config.max_active_paths = SHERPA_ONNX_OR(config->max_active_paths, 4); - recognizer_config.enable_endpoint = config->enable_endpoint; + recognizer_config.enable_endpoint = SHERPA_ONNX_OR(config->enable_endpoint, 0); recognizer_config.endpoint_config.rule1.min_trailing_silence = - config->rule1_min_trailing_silence; + SHERPA_ONNX_OR(config->rule1_min_trailing_silence, 2.4); recognizer_config.endpoint_config.rule2.min_trailing_silence = - config->rule2_min_trailing_silence; + SHERPA_ONNX_OR(config->rule2_min_trailing_silence, 1.2); recognizer_config.endpoint_config.rule3.min_utterance_length = - config->rule3_min_utterance_length; + SHERPA_ONNX_OR(config->rule3_min_utterance_length, 20); if (config->model_config.debug) { fprintf(stderr, "%s\n", recognizer_config.ToString().c_str()); @@ -171,34 +173,34 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( const SherpaOnnxOfflineRecognizerConfig *config) { sherpa_onnx::OfflineRecognizerConfig recognizer_config; - recognizer_config.feat_config.sampling_rate = config->feat_config.sample_rate; + recognizer_config.feat_config.sampling_rate = SHERPA_ONNX_OR(config->feat_config.sample_rate, 16000); - recognizer_config.feat_config.feature_dim = config->feat_config.feature_dim; + recognizer_config.feat_config.feature_dim = SHERPA_ONNX_OR(config->feat_config.feature_dim, 80); recognizer_config.model_config.transducer.encoder_filename = - config->model_config.transducer.encoder; + SHERPA_ONNX_OR(config->model_config.transducer.encoder, ""); recognizer_config.model_config.transducer.decoder_filename = - config->model_config.transducer.decoder; + 
SHERPA_ONNX_OR(config->model_config.transducer.decoder, ""); recognizer_config.model_config.transducer.joiner_filename = - config->model_config.transducer.joiner; + SHERPA_ONNX_OR(config->model_config.transducer.joiner,""); recognizer_config.model_config.paraformer.model = - config->model_config.paraformer.model; + SHERPA_ONNX_OR(config->model_config.paraformer.model, ""); recognizer_config.model_config.nemo_ctc.model = - config->model_config.nemo_ctc.model; + SHERPA_ONNX_OR(config->model_config.nemo_ctc.model, ""); - recognizer_config.model_config.tokens = config->model_config.tokens; - recognizer_config.model_config.num_threads = config->model_config.num_threads; - recognizer_config.model_config.debug = config->model_config.debug; + recognizer_config.model_config.tokens = SHERPA_ONNX_OR(config->model_config.tokens, ""); + recognizer_config.model_config.num_threads = SHERPA_ONNX_OR(config->model_config.num_threads, 1); + recognizer_config.model_config.debug = SHERPA_ONNX_OR(config->model_config.debug, 0); - recognizer_config.lm_config.model = config->lm_config.model; - recognizer_config.lm_config.scale = config->lm_config.scale; + recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, ""); + recognizer_config.lm_config.scale = SHERPA_ONNX_OR(config->lm_config.scale, 1.0); - recognizer_config.decoding_method = config->decoding_method; - recognizer_config.max_active_paths = config->max_active_paths; + recognizer_config.decoding_method = SHERPA_ONNX_OR(config->decoding_method, "greedy_search"); + recognizer_config.max_active_paths = SHERPA_ONNX_OR(config->max_active_paths, 4); if (config->model_config.debug) { fprintf(stderr, "%s\n", recognizer_config.ToString().c_str());