diff --git a/mfc-examples/.gitignore b/mfc-examples/.gitignore new file mode 100644 index 00000000..49dc89d0 --- /dev/null +++ b/mfc-examples/.gitignore @@ -0,0 +1,399 @@ +# See https://github.com/github/gitignore/blob/main/VisualStudio.gitignore +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# 
Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files +*.ncb +*.aps + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# Visual Studio History (VSHistory) files +.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml diff --git a/mfc-examples/README.md 
b/mfc-examples/README.md new file mode 100644 index 00000000..45d48144 --- /dev/null +++ b/mfc-examples/README.md @@ -0,0 +1,54 @@ +# Speech recognition with Visual C++ MFC + +This directory contains examples showing how to use Next-gen Kaldi in MFC +for speech recognition. + +Caution: You need to use Windows and install Visual Studio in order to run it. +We use the bash script below to demonstrate how to use it. Please change +the commands accordingly for Windows. + +## Streaming speech recognition + +```bash +mkdir -p $HOME/open-source +cd $HOME/open-source + +git clone https://github.com/k2-fsa/sherpa-onnx +cd sherpa-onnx +mkdir build +cd build + +cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=./install .. +cmake --build . --config Release --target install + +cd ../mfc-examples + +msbuild ./mfc-examples.sln /property:Configuration=Release /property:Platform=x64 +cp ../build/install/lib/*.dll ./x64/Release/ + +# now run the program + +./x64/Release/StreamingSpeechRecognition.exe +``` + +Note that we also need to download pre-trained models. Please +refer to https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/index.html +for a list of streaming models. + +We use the following model for demonstration. 
+ +```bash +cd $HOME/open-source/sherpa-onnx/mfc-examples/x64/Release +wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx +wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/decoder-epoch-12-avg-4-chunk-16-left-128.onnx +wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/joiner-epoch-12-avg-4-chunk-16-left-128.onnx +wget https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt + +# now rename +mv encoder-epoch-12-avg-4-chunk-16-left-128.onnx encoder.onnx +mv decoder-epoch-12-avg-4-chunk-16-left-128.onnx decoder.onnx +mv joiner-epoch-12-avg-4-chunk-16-left-128.onnx joiner.onnx + +# Now run it! +./StreamingSpeechRecognition.exe +``` diff --git a/mfc-examples/StreamingSpeechRecognition/Resource.h b/mfc-examples/StreamingSpeechRecognition/Resource.h new file mode 100644 index 00000000..d4a280c9 --- /dev/null +++ b/mfc-examples/StreamingSpeechRecognition/Resource.h @@ -0,0 +1,18 @@ +//{{NO_DEPENDENCIES}} +// Microsoft Visual C++ generated include file. 
+// Used by StreamingSpeechRecognition.rc
+//
+#define IDD_STREAMINGSPEECHRECOGNITION_DIALOG 102
+#define IDR_MAINFRAME 128
+#define IDC_EDIT1 1000
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 130
+#define _APS_NEXT_COMMAND_VALUE 32771
+#define _APS_NEXT_CONTROL_VALUE 1001
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp
new file mode 100644
index 00000000..796c18c9
--- /dev/null
+++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.cpp
@@ -0,0 +1,82 @@
+
+// StreamingSpeechRecognition.cpp : Defines the class behaviors for the
+// application.
+//
+
+// clang-format off
+// pch.h must stay the first include: when precompiled headers are in use
+// (/Yu), MSVC ignores every line that precedes it.
+#include "pch.h"
+#include "framework.h"
+#include "StreamingSpeechRecognition.h"
+#include "StreamingSpeechRecognitionDlg.h"
+// clang-format on
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#endif
+
+// CStreamingSpeechRecognitionApp
+
+BEGIN_MESSAGE_MAP(CStreamingSpeechRecognitionApp, CWinApp)
+ON_COMMAND(ID_HELP, &CWinApp::OnHelp)
+END_MESSAGE_MAP()
+
+// CStreamingSpeechRecognitionApp construction
+
+CStreamingSpeechRecognitionApp::CStreamingSpeechRecognitionApp() {
+  // TODO: add construction code here,
+  // Place all significant initialization in InitInstance
+}
+
+// The one and only CStreamingSpeechRecognitionApp object
+
+CStreamingSpeechRecognitionApp theApp;
+
+// CStreamingSpeechRecognitionApp initialization
+
+// Shows the main dialog modally. Returns FALSE so that the application
+// exits when the dialog closes instead of starting the message pump.
+BOOL CStreamingSpeechRecognitionApp::InitInstance() {
+  CWinApp::InitInstance();
+
+  // Create the shell manager, in case the dialog contains
+  // any shell tree view or shell list view controls.
+  CShellManager *pShellManager = new CShellManager;
+
+  // Activate "Windows Native" visual manager for enabling themes in MFC
+  // controls
+  CMFCVisualManager::SetDefaultManager(RUNTIME_CLASS(CMFCVisualManagerWindows));
+
+  // Standard initialization
+  // If you are not using these features and wish to reduce the size
+  // of your final executable, you should remove from the following
+  // the specific initialization routines you do not need
+  // Change the registry key under which our settings are stored
+  // TODO: You should modify this string to be something appropriate
+  // such as the name of your company or organization
+  SetRegistryKey(_T("Local AppWizard-Generated Applications"));
+
+  CStreamingSpeechRecognitionDlg dlg;
+  m_pMainWnd = &dlg;
+  INT_PTR nResponse = dlg.DoModal();
+  if (nResponse == -1) {
+    TRACE(traceAppMsg, 0,
+          "Warning: dialog creation failed, so application is terminating "
+          "unexpectedly.\n");
+    TRACE(traceAppMsg, 0,
+          "Warning: if you are using MFC controls on the dialog, you cannot "
+          "#define _AFX_NO_MFC_CONTROLS_IN_DIALOGS.\n");
+  }
+
+  // Delete the shell manager created above (deleting nullptr is a no-op).
+  delete pShellManager;
+
+#if !defined(_AFXDLL) && !defined(_AFX_NO_MFC_CONTROLS_IN_DIALOGS)
+  ControlBarCleanUp();
+#endif
+
+  // Since the dialog has been closed, return FALSE so that we exit the
+  //  application, rather than start the application's message pump.
+  return FALSE;
+}
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.h b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.h
new file mode 100644
index 00000000..8822c7c4
--- /dev/null
+++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.h
@@ -0,0 +1,31 @@
+
+// StreamingSpeechRecognition.h : main header file for the PROJECT_NAME
+// application
+//
+
+#pragma once
+
+#ifndef __AFXWIN_H__
+#error "include 'pch.h' before including this file for PCH"
+#endif
+
+#include "resource.h"  // main symbols
+
+// CStreamingSpeechRecognitionApp:
+// See StreamingSpeechRecognition.cpp for the implementation of this class
+//
+
+class CStreamingSpeechRecognitionApp : public CWinApp {
+ public:
+  CStreamingSpeechRecognitionApp();
+
+  // Overrides
+ public:
+  BOOL InitInstance() override;
+
+  // Implementation
+
+  DECLARE_MESSAGE_MAP()
+};
+
+extern CStreamingSpeechRecognitionApp theApp;
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.rc b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.rc new file mode 100644 index 00000000..34b8a350 Binary files /dev/null and b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.rc differ diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.vcxproj b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.vcxproj new file mode 100644 index 00000000..767dcbe7 --- /dev/null +++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.vcxproj @@ -0,0 +1,219 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 16.0 + {A79C2604-C33D-497C-9770-D34E118B77FE} + MFCProj + StreamingSpeechRecognition + 10.0 + + + + Application + true + v142 + Unicode + Static + + + Application + false + v142 + true + Unicode + Static + + + Application + true + v142 + Unicode + Static + + + Application + false + v142 + true + Unicode + Static + + + + + + 
+ + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + true + WIN32;_WINDOWS;_DEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + + + false + true + _DEBUG;%(PreprocessorDefinitions) + + + 0x0409 + _DEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + _WINDOWS;_DEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + + + false + true + _DEBUG;%(PreprocessorDefinitions) + + + 0x0409 + _DEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + true + true + WIN32;_WINDOWS;NDEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + true + true + + + false + true + NDEBUG;%(PreprocessorDefinitions) + + + 0x0409 + NDEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + true + true + _WINDOWS;NDEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + true + true + + + false + true + NDEBUG;%(PreprocessorDefinitions) + + + 0x0409 + NDEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + + + + + + + + + Create + Create + Create + Create + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.vcxproj.filters b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.vcxproj.filters new file mode 100644 index 00000000..693b4c86 --- /dev/null +++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognition.vcxproj.filters @@ -0,0 +1,63 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + 
+ Header Files + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + + + Resource Files + + + + + Resource Files + + + + + Resource Files + + + \ No newline at end of file
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp
new file mode 100644
index 00000000..0b7f53ea
--- /dev/null
+++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.cpp
@@ -0,0 +1,537 @@
+
+// StreamingSpeechRecognitionDlg.cpp : implementation file
+//
+
+// clang-format off
+// pch.h must stay the first include: when precompiled headers are in use
+// (/Yu), MSVC ignores every line that precedes it.
+#include "pch.h"
+#include "framework.h"
+#include "afxdialogex.h"
+#include "StreamingSpeechRecognitionDlg.h"
+// clang-format on
+
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "StreamingSpeechRecognition.h"
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#endif
+
+// Sample rate, in Hz, used for audio capture and for feature extraction.
+static constexpr int kSampleRate = 16000;
+
+// Blocks until |thread| exits while still servicing messages that other
+// threads *send* to this one. RunThread() updates the edit control from the
+// worker thread, which sends WM_SETTEXT to the UI thread; a plain
+// WaitForSingleObject() on the UI thread could deadlock with that call.
+static void WaitForThread(HANDLE thread) {
+  for (;;) {
+    // Dispatches pending sent messages; queued input is left untouched.
+    MSG msg;
+    PeekMessage(&msg, nullptr, 0, 0, PM_NOREMOVE | PM_QS_SENDMESSAGE);
+
+    DWORD r =
+        MsgWaitForMultipleObjects(1, &thread, FALSE, INFINITE, QS_SENDMESSAGE);
+    if (r != WAIT_OBJECT_0 + 1) {
+      return;  // the thread has exited, or the wait itself failed
+    }
+  }
+}
+
+// Initializes PortAudio. Exits the process on failure.
+Microphone::Microphone() {
+  PaError err = Pa_Initialize();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+    exit(-2);
+  }
+}
+
+// Terminates PortAudio. A failure is only reported: the object is already
+// being torn down, so calling exit() here would merely skip the cleanup that
+// is still pending in the caller.
+Microphone::~Microphone() {
+  PaError err = Pa_Terminate();
+  if (err != paNoError) {
+    fprintf(stderr, "portaudio error: %s\n", Pa_GetErrorText(err));
+  }
+}
+
+// CStreamingSpeechRecognitionDlg dialog
+
+CStreamingSpeechRecognitionDlg::CStreamingSpeechRecognitionDlg(
+    CWnd *pParent /*=nullptr*/)
+    : CDialogEx(IDD_STREAMINGSPEECHRECOGNITION_DIALOG, pParent) {
+  m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME);
+}
+
+CStreamingSpeechRecognitionDlg::~CStreamingSpeechRecognitionDlg() {
+  // The dialog can be closed while recording is still in progress, so stop
+  // the audio callback and the worker thread before freeing what they use.
+  StopRecording();
+
+  if (stream_) {
+    DestroyOnlineStream(stream_);
+    stream_ = nullptr;
+  }
+
+  if (recognizer_) {
+    DestroyOnlineRecognizer(recognizer_);
+    recognizer_ = nullptr;
+  }
+}
+
+void CStreamingSpeechRecognitionDlg::DoDataExchange(CDataExchange *pDX) {
+  CDialogEx::DoDataExchange(pDX);
+  DDX_Control(pDX, IDOK, my_btn_);
+  DDX_Control(pDX, IDC_EDIT1, my_text_);
+}
+
+BEGIN_MESSAGE_MAP(CStreamingSpeechRecognitionDlg, CDialogEx)
+ON_WM_PAINT()
+ON_WM_QUERYDRAGICON()
+ON_BN_CLICKED(IDOK, &CStreamingSpeechRecognitionDlg::OnBnClickedOk)
+END_MESSAGE_MAP()
+
+// CStreamingSpeechRecognitionDlg message handlers
+
+BOOL CStreamingSpeechRecognitionDlg::OnInitDialog() {
+  CDialogEx::OnInitDialog();
+
+  // Set the icon for this dialog.  The framework does this automatically
+  //  when the application's main window is not a dialog
+  SetIcon(m_hIcon, TRUE);   // Set big icon
+  SetIcon(m_hIcon, FALSE);  // Set small icon
+
+  SetWindowText(_T("Real-time speech recognition with Next-gen Kaldi"));
+  InitMicrophone();
+
+  return TRUE;  // return TRUE unless you set the focus to a control
+}
+
+// If you add a minimize button to your dialog, you will need the code below
+//  to draw the icon.  For MFC applications using the document/view model,
+//  this is automatically done for you by the framework.
+
+void CStreamingSpeechRecognitionDlg::OnPaint() {
+  if (IsIconic()) {
+    CPaintDC dc(this);  // device context for painting
+
+    SendMessage(WM_ICONERASEBKGND, reinterpret_cast<WPARAM>(dc.GetSafeHdc()),
+                0);
+
+    // Center icon in client rectangle
+    int cxIcon = GetSystemMetrics(SM_CXICON);
+    int cyIcon = GetSystemMetrics(SM_CYICON);
+    CRect rect;
+    GetClientRect(&rect);
+    int x = (rect.Width() - cxIcon + 1) / 2;
+    int y = (rect.Height() - cyIcon + 1) / 2;
+
+    // Draw the icon
+    dc.DrawIcon(x, y, m_hIcon);
+  } else {
+    CDialogEx::OnPaint();
+  }
+}
+
+// The system calls this function to obtain the cursor to display while the
+// user drags the minimized window.
+HCURSOR CStreamingSpeechRecognitionDlg::OnQueryDragIcon() {
+  return static_cast<HCURSOR>(m_hIcon);
+}
+
+// PortAudio capture callback: feeds the captured samples to the current
+// recognition stream. Returns paComplete once recording has been switched
+// off so that PortAudio stops invoking it.
+static int32_t RecordCallback(const void *input_buffer,
+                              void * /*output_buffer*/,
+                              unsigned long frames_per_buffer,  // NOLINT
+                              const PaStreamCallbackTimeInfo * /*time_info*/,
+                              PaStreamCallbackFlags /*status_flags*/,
+                              void *user_data) {
+  auto dlg = reinterpret_cast<CStreamingSpeechRecognitionDlg *>(user_data);
+
+  auto stream = dlg->stream_;
+  if (stream && input_buffer) {
+    AcceptWaveform(stream, kSampleRate,
+                   reinterpret_cast<const float *>(input_buffer),
+                   static_cast<int32_t>(frames_per_buffer));
+  }
+
+  return dlg->started_ ? paContinue : paComplete;
+}
+
+// Shows a PortAudio error in the edit control and disables the button.
+void CStreamingSpeechRecognitionDlg::ReportPortAudioError(PaError err) {
+  AppendLineToMultilineEditCtrl(std::string("PortAudio error: ") +
+                                Pa_GetErrorText(err));
+  my_btn_.EnableWindow(FALSE);
+}
+
+// Handles the Start/Stop button. The recognizer is created lazily on the
+// first click; after that each click toggles recording.
+void CStreamingSpeechRecognitionDlg::OnBnClickedOk() {
+  if (!recognizer_) {
+    AppendLineToMultilineEditCtrl("Creating recognizer...");
+    InitRecognizer();
+    if (!recognizer_) {
+      // failed to create the recognizer
+      return;
+    }
+    AppendLineToMultilineEditCtrl("Recognizer created!");
+  }
+
+  if (!started_) {
+    StartRecording();
+    return;
+  }
+
+  PaError err = StopRecording();
+  if (err != paNoError) {
+    ReportPortAudioError(err);
+    return;
+  }
+
+  my_btn_.SetWindowText(_T("Start"));
+  AppendLineToMultilineEditCtrl("Stopped");
+}
+
+// Creates a fresh recognition stream, opens the default input device and
+// starts the recognizer thread. On failure everything that was started is
+// rolled back, the error is shown and the button is disabled.
+void CStreamingSpeechRecognitionDlg::StartRecording() {
+  if (stream_) {
+    DestroyOnlineStream(stream_);
+    stream_ = nullptr;
+  }
+
+  stream_ = CreateOnlineStream(recognizer_);
+  if (!stream_) {
+    AppendLineToMultilineEditCtrl("Failed to create a recognition stream");
+    my_btn_.EnableWindow(FALSE);
+    return;
+  }
+
+  PaStreamParameters param;
+  param.device = Pa_GetDefaultInputDevice();
+  const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device);
+  if (!info) {
+    AppendLineToMultilineEditCtrl("No default input device found!");
+    my_btn_.EnableWindow(FALSE);
+    return;
+  }
+  param.channelCount = 1;
+  param.sampleFormat = paFloat32;
+  param.suggestedLatency = info->defaultLowInputLatency;
+  param.hostApiSpecificStreamInfo = nullptr;
+
+  // Must be set before the stream starts: RecordCallback() returns
+  // paComplete as soon as it sees started_ == false.
+  started_ = true;
+
+  pa_stream_ = nullptr;
+  PaError err =
+      Pa_OpenStream(&pa_stream_, &param, nullptr, /* &outputParameters, */
+                    kSampleRate,
+                    0,          // frames per buffer
+                    paClipOff,  // we won't output out of range samples
+                                // so don't bother clipping them
+                    RecordCallback, this);
+  if (err != paNoError) {
+    // PortAudio leaves the stream pointer unspecified after a failed open.
+    pa_stream_ = nullptr;
+    started_ = false;
+    ReportPortAudioError(err);
+    return;
+  }
+
+  err = Pa_StartStream(pa_stream_);
+  if (err != paNoError) {
+    started_ = false;
+    Pa_CloseStream(pa_stream_);  // do not leak the opened stream
+    pa_stream_ = nullptr;
+    ReportPortAudioError(err);
+    return;
+  }
+
+  thread_ = new RecognizerThread(this);
+  thread_->m_bAutoDelete = FALSE;  // joined and deleted by StopRecording()
+  if (!thread_->CreateThread(CREATE_SUSPENDED)) {
+    delete thread_;
+    thread_ = nullptr;
+    StopRecording();
+    AppendLineToMultilineEditCtrl("Failed to create the recognizer thread");
+    my_btn_.EnableWindow(FALSE);
+    return;
+  }
+  thread_->ResumeThread();
+
+  AppendLineToMultilineEditCtrl("Started! Please speak");
+  my_btn_.SetWindowText(_T("Stop"));
+}
+
+// Stops audio capture and joins the recognizer thread. Safe to call when
+// nothing is running. Returns the result of closing the PortAudio stream
+// (paNoError if no stream was open); the thread is joined either way.
+PaError CStreamingSpeechRecognitionDlg::StopRecording() {
+  started_ = false;  // asks RecordCallback() and RunThread() to finish
+
+  PaError err = paNoError;
+  if (pa_stream_) {
+    Pa_Sleep(200);  // sleep for 200ms
+    err = Pa_CloseStream(pa_stream_);
+    pa_stream_ = nullptr;
+  }
+
+  if (thread_) {
+    WaitForThread(thread_->m_hThread);
+    delete thread_;
+    thread_ = nullptr;
+  }
+
+  return err;
+}
+
+// Reports the default input device in the edit control, or disables the
+// button when there is none.
+void CStreamingSpeechRecognitionDlg::InitMicrophone() {
+  int default_device = Pa_GetDefaultInputDevice();
+  const PaDeviceInfo *info = default_device == paNoDevice
+                                 ? nullptr
+                                 : Pa_GetDeviceInfo(default_device);
+  if (!info) {
+    AppendLineToMultilineEditCtrl("No default input device found!");
+    my_btn_.EnableWindow(FALSE);
+    return;  // there is no device whose name could be shown
+  }
+
+  AppendLineToMultilineEditCtrl(std::string("Selected device ") + info->name);
+}
+
+// Returns true if |filename| can be opened for reading.
+bool CStreamingSpeechRecognitionDlg::Exists(const std::string &filename) {
+  std::ifstream is(filename);
+  return is.good();
+}
+
+// Creates recognizer_ from the model files in the current working
+// directory. If a file is missing, or creation fails, a message is shown and
+// recognizer_ stays null.
+void CStreamingSpeechRecognitionDlg::InitRecognizer() {
+  const std::string encoder = "./encoder.onnx";
+  const std::string decoder = "./decoder.onnx";
+  const std::string joiner = "./joiner.onnx";
+  const std::string tokens = "./tokens.txt";
+
+  bool is_ok = true;
+  for (const std::string *file : {&encoder, &decoder, &joiner, &tokens}) {
+    if (!Exists(*file)) {
+      AppendLineToMultilineEditCtrl(*file + " does not exist!");
+      is_ok = false;
+    }
+  }
+
+  if (!is_ok) {
+    my_btn_.EnableWindow(FALSE);
+    std::string msg =
+        "\r\nPlease go to "
+        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html "
+        "\r\n";
+    msg += "to download a streaming model.";
+    msg +=
+        " You need to rename them to encoder.onnx, decoder.onnx, and "
+        "joiner.onnx correspondingly";
+    AppendLineToMultilineEditCtrl(msg);
+    return;
+  }
+
+  // Value-initialize so that any field not set below is zero rather than
+  // indeterminate.
+  SherpaOnnxOnlineRecognizerConfig config = {};
+  config.model_config.debug = 0;
+  config.model_config.num_threads = 2;
+  config.model_config.provider = "cpu";
+
+  config.decoding_method = "greedy_search";
+  config.max_active_paths = 4;
+
+  config.feat_config.sample_rate = kSampleRate;
+  config.feat_config.feature_dim = 80;
+
+  config.enable_endpoint = 1;
+  config.rule1_min_trailing_silence = 1.2f;
+  config.rule2_min_trailing_silence = 0.8f;
+  config.rule3_min_utterance_length = 300.0f;
+
+  config.model_config.tokens = tokens.c_str();
+  config.model_config.encoder = encoder.c_str();
+  config.model_config.decoder = decoder.c_str();
+  config.model_config.joiner = joiner.c_str();
+
+  recognizer_ = CreateOnlineRecognizer(&config);
+  if (!recognizer_) {
+    AppendLineToMultilineEditCtrl("Failed to create the recognizer");
+  }
+}
+
+// see
+// https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
+//
+// Converts |utf8| to UTF-16. Throws std::logic_error on malformed input
+// (bad lead or continuation byte, truncated sequence, surrogate, or a code
+// point above U+10FFFF).
+std::wstring Utf8ToUtf16(const std::string &utf8) {
+  std::vector<unsigned long> unicode;
+  size_t i = 0;
+  while (i < utf8.size()) {
+    unsigned long uni;
+    size_t todo;
+    unsigned char ch = utf8[i++];
+    if (ch <= 0x7F) {
+      uni = ch;
+      todo = 0;
+    } else if (ch <= 0xBF) {
+      throw std::logic_error("not a UTF-8 string");
+    } else if (ch <= 0xDF) {
+      uni = ch & 0x1F;
+      todo = 1;
+    } else if (ch <= 0xEF) {
+      uni = ch & 0x0F;
+      todo = 2;
+    } else if (ch <= 0xF7) {
+      uni = ch & 0x07;
+      todo = 3;
+    } else {
+      throw std::logic_error("not a UTF-8 string");
+    }
+    for (size_t j = 0; j < todo; ++j) {
+      if (i == utf8.size()) throw std::logic_error("not a UTF-8 string");
+      unsigned char cont = utf8[i++];  // continuation byte, must be 10xxxxxx
+      if (cont < 0x80 || cont > 0xBF) {
+        throw std::logic_error("not a UTF-8 string");
+      }
+      uni <<= 6;
+      uni += cont & 0x3F;
+    }
+    if (uni >= 0xD800 && uni <= 0xDFFF)
+      throw std::logic_error("not a UTF-8 string");
+    if (uni > 0x10FFFF) throw std::logic_error("not a UTF-8 string");
+    unicode.push_back(uni);
+  }
+  std::wstring utf16;
+  for (unsigned long uni : unicode) {
+    if (uni <= 0xFFFF) {
+      utf16 += static_cast<wchar_t>(uni);
+    } else {
+      uni -= 0x10000;
+      utf16 += static_cast<wchar_t>((uni >> 10) + 0xD800);
+      utf16 += static_cast<wchar_t>((uni & 0x3FF) + 0xDC00);
+    }
+  }
+  return utf16;
+}
+
+// Converts |s| to UTF-16 for display. Text that is not valid UTF-8 is
+// converted with the active ANSI code page instead, so that unexpected input
+// cannot bring the application down with an uncaught exception.
+static std::wstring ToDisplayText(const std::string &s) {
+  try {
+    return Utf8ToUtf16(s);
+  } catch (const std::logic_error &) {
+    int len = static_cast<int>(s.size());
+    int n = MultiByteToWideChar(CP_ACP, 0, s.data(), len, nullptr, 0);
+    if (n <= 0) {
+      return std::wstring();
+    }
+    std::wstring wide(static_cast<size_t>(n), L'\0');
+    MultiByteToWideChar(CP_ACP, 0, s.data(), len, &wide[0], n);
+    return wide;
+  }
+}
+
+// Appends |s| at the end of the edit control.
+void CStreamingSpeechRecognitionDlg::AppendTextToEditCtrl(
+    const std::string &s) {
+  // get the initial text length
+  int nLength = my_text_.GetWindowTextLength();
+  // put the selection at the end of text
+  my_text_.SetSel(nLength, nLength);
+  // replace the selection
+  my_text_.ReplaceSel(ToDisplayText(s).c_str());
+}
+
+// Appends |s| on a new line of the edit control.
+void CStreamingSpeechRecognitionDlg::AppendLineToMultilineEditCtrl(
+    const std::string &s) {
+  AppendTextToEditCtrl("\r\n" + s);
+}
+
+// Joins the finished |results| and the in-progress text |s| into numbered
+// lines ("0: ...", "1: ...") separated by "\r\n".
+static std::string Cat(const std::vector<std::string> &results,
+                       const std::string &s) {
+  std::ostringstream os;
+  std::string sep;
+
+  size_t i = 0;
+  for (; i != results.size(); ++i) {
+    os << sep << i << ": " << results[i];
+    sep = "\r\n";
+  }
+
+  if (!s.empty()) {
+    os << sep << i << ": " << s;
+  }
+  return os.str();
+}
+
+// Body of the recognizer thread: decodes whatever audio has arrived, shows
+// the text recognized so far and starts a new segment at each endpoint.
+// Runs until started_ becomes false. Always returns 0.
+int CStreamingSpeechRecognitionDlg::RunThread() {
+  std::vector<std::string> results;
+  std::string last_text;
+  while (started_) {
+    while (IsOnlineStreamReady(recognizer_, stream_)) {
+      DecodeOnlineStream(recognizer_, stream_);
+    }
+
+    std::string text;
+    auto r = GetOnlineStreamResult(recognizer_, stream_);
+    if (r) {
+      if (r->text) {
+        text = r->text;
+      }
+      DestroyOnlineRecognizerResult(r);
+    }
+
+    if (!text.empty() && last_text != text) {
+      // The window may already be gone if the dialog was closed while
+      // recording.
+      if (my_text_.GetSafeHwnd()) {
+        my_text_.SetWindowText(ToDisplayText(Cat(results, text)).c_str());
+      }
+      last_text = text;
+    }
+
+    if (IsEndpoint(recognizer_, stream_)) {
+      Reset(recognizer_, stream_);
+      if (!text.empty()) {
+        results.push_back(std::move(text));
+      }
+    }
+
+    Pa_Sleep(100);  // sleep for 100ms
+  }
+
+  return 0;
+}
diff --git a/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.h b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.h
new file mode 100644
index 00000000..0c624ba3
--- /dev/null
+++ b/mfc-examples/StreamingSpeechRecognition/StreamingSpeechRecognitionDlg.h
@@ -0,0 +1,89 @@
+
+// StreamingSpeechRecognitionDlg.h : header file
+//
+
+#pragma once
+
+#include <atomic>
+#include <string>
+
+#include "portaudio.h"
+#include "sherpa-onnx/c-api/c-api.h"
+
+// Initializes PortAudio on construction and terminates it on destruction.
+class Microphone {
+ public:
+  Microphone();
+  ~Microphone();
+};
+
+class RecognizerThread;
+
+// CStreamingSpeechRecognitionDlg dialog
+class CStreamingSpeechRecognitionDlg : public CDialogEx {
+  // Construction
+ public:
+  CStreamingSpeechRecognitionDlg(
+      CWnd *pParent = nullptr);  // standard constructor
+  ~CStreamingSpeechRecognitionDlg();
+
+// Dialog Data
+#ifdef AFX_DESIGN_TIME
+  enum { IDD = IDD_STREAMINGSPEECHRECOGNITION_DIALOG };
+#endif
+
+ protected:
+  virtual void DoDataExchange(CDataExchange *pDX);  // DDX/DDV support
+
+  // Implementation
+ protected:
+  HICON m_hIcon;
+
+  // Generated message map functions
+  virtual BOOL OnInitDialog();
+  afx_msg void OnPaint();
+  afx_msg HCURSOR OnQueryDragIcon();
+  DECLARE_MESSAGE_MAP()
+ private:
+  Microphone mic_;  // first member: constructed first, destroyed last
+
+  SherpaOnnxOnlineRecognizer *recognizer_ = nullptr;
+
+  PaStream *pa_stream_ = nullptr;
+  RecognizerThread *thread_ = nullptr;
+  CButton my_btn_;
+  CEdit my_text_;
+
+ public:
+  // Written by the UI thread; read by the PortAudio callback and by the
+  // recognizer thread, hence atomic.
+  std::atomic<bool> started_{false};
+  SherpaOnnxOnlineStream *stream_ = nullptr;
+
+ public:
+  int RunThread();
+  afx_msg void OnBnClickedOk();
+
+ private:
+  void AppendTextToEditCtrl(const std::string &s);
+  void AppendLineToMultilineEditCtrl(const std::string &s);
+  void InitMicrophone();
+
+  bool Exists(const std::string &filename);
+  void InitRecognizer();
+
+  void StartRecording();
+  PaError StopRecording();
+  void ReportPortAudioError(PaError err);
+};
+
+// Worker thread that runs CStreamingSpeechRecognitionDlg::RunThread().
+class RecognizerThread : public CWinThread {
+ public:
+  explicit RecognizerThread(CStreamingSpeechRecognitionDlg *dlg) : dlg_(dlg) {}
+  BOOL InitInstance() override { return TRUE; }
+  int Run() override { return dlg_->RunThread(); }
+
+ private:
+  CStreamingSpeechRecognitionDlg *dlg_;
+};
\ No newline at end of file
diff --git a/mfc-examples/StreamingSpeechRecognition/framework.h b/mfc-examples/StreamingSpeechRecognition/framework.h
new file mode 100644
index 00000000..65e02c32
--- /dev/null
+++ b/mfc-examples/StreamingSpeechRecognition/framework.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#ifndef VC_EXTRALEAN
+#define VC_EXTRALEAN  // Exclude rarely-used stuff from Windows headers
+#endif
+
+#include "targetver.h"
+
+#define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS  // some CString constructors will be
+                                            // explicit
+
+// turns off MFC's hiding of some common and often safely ignored warning
+// messages
+#define _AFX_ALL_WARNINGS
+
+#include <afxext.h>  // MFC extensions
+#include <afxwin.h>  // MFC core and standard components
+
+#ifndef _AFX_NO_OLE_SUPPORT
+#include <afxdtctl.h>  // MFC support for Internet Explorer 4 Common Controls
+#endif
+#ifndef _AFX_NO_AFXCMN_SUPPORT
+#include <afxcmn.h>  // MFC support for Windows Common Controls
+#endif               // _AFX_NO_AFXCMN_SUPPORT
+
+#include <afxcontrolbars.h>  // MFC support for ribbons and control bars
diff --git a/mfc-examples/StreamingSpeechRecognition/pch.cpp b/mfc-examples/StreamingSpeechRecognition/pch.cpp
new file mode 100644
index 00000000..00df68aa
--- /dev/null
+++ b/mfc-examples/StreamingSpeechRecognition/pch.cpp
@@ -0,0 +1,6 @@
+// pch.cpp: source file corresponding to the pre-compiled header
+
+#include "pch.h"
+
+// When you are using pre-compiled headers, this source file is necessary for
+// compilation to succeed.
diff --git a/mfc-examples/StreamingSpeechRecognition/pch.h b/mfc-examples/StreamingSpeechRecognition/pch.h new file mode 100644 index 00000000..4e7f5afb --- /dev/null +++ b/mfc-examples/StreamingSpeechRecognition/pch.h @@ -0,0 +1,15 @@ +// pch.h: This is a precompiled header file. +// Files listed below are compiled only once, improving build performance for +// future builds. This also affects IntelliSense performance, including code +// completion and many code browsing features. However, files listed here are +// ALL re-compiled if any one of them is updated between builds. Do not add +// files here that you will be updating frequently as this negates the +// performance advantage. + +#ifndef PCH_H +#define PCH_H + +// add headers that you want to pre-compile here +#include "framework.h" + +#endif // PCH_H diff --git a/mfc-examples/StreamingSpeechRecognition/res/StreamingSpeechRecognition.ico b/mfc-examples/StreamingSpeechRecognition/res/StreamingSpeechRecognition.ico new file mode 100644 index 00000000..d56fbcdf Binary files /dev/null and b/mfc-examples/StreamingSpeechRecognition/res/StreamingSpeechRecognition.ico differ diff --git a/mfc-examples/StreamingSpeechRecognition/res/StreamingSpeechRecognition.rc2 b/mfc-examples/StreamingSpeechRecognition/res/StreamingSpeechRecognition.rc2 new file mode 100644 index 00000000..245f8c1a Binary files /dev/null and b/mfc-examples/StreamingSpeechRecognition/res/StreamingSpeechRecognition.rc2 differ diff --git a/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props b/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props new file mode 100644 index 00000000..c142360e --- /dev/null +++ b/mfc-examples/StreamingSpeechRecognition/sherpa-onnx-deps.props @@ -0,0 +1,21 @@ + + + + + + ..\..\build + ..\..\build\install + sherpa-onnx-portaudio.lib;sherpa-onnx-c-api.lib;sherpa-onnx-core.lib + + + + + 
$(SherpaOnnxBuildDirectory)\_deps\portaudio-src\include;$(SherpaOnnxInstallDirectory)\include;%(AdditionalIncludeDirectories) + + + $(SherpaOnnxInstallDirectory)\lib;%(AdditionalLibraryDirectories) + $(SherpaOnnxLibraries) + + + + diff --git a/mfc-examples/StreamingSpeechRecognition/targetver.h b/mfc-examples/StreamingSpeechRecognition/targetver.h new file mode 100644 index 00000000..87fe2aec --- /dev/null +++ b/mfc-examples/StreamingSpeechRecognition/targetver.h @@ -0,0 +1,9 @@ +#pragma once + +// Including SDKDDKVer.h defines the highest available Windows platform. + +// If you wish to build your application for a previous Windows platform, +// include WinSDKVer.h and set the _WIN32_WINNT macro to the platform you wish +// to support before including SDKDDKVer.h. + +#include diff --git a/mfc-examples/mfc-examples.sln b/mfc-examples/mfc-examples.sln new file mode 100644 index 00000000..94a6a9ab --- /dev/null +++ b/mfc-examples/mfc-examples.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32630.194 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StreamingSpeechRecognition", "StreamingSpeechRecognition\StreamingSpeechRecognition.vcxproj", "{A79C2604-C33D-497C-9770-D34E118B77FE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {A79C2604-C33D-497C-9770-D34E118B77FE}.Debug|x64.ActiveCfg = Debug|x64 + {A79C2604-C33D-497C-9770-D34E118B77FE}.Debug|x64.Build.0 = Debug|x64 + {A79C2604-C33D-497C-9770-D34E118B77FE}.Debug|x86.ActiveCfg = Debug|Win32 + {A79C2604-C33D-497C-9770-D34E118B77FE}.Debug|x86.Build.0 = Debug|Win32 + {A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x64.ActiveCfg = Release|x64 + 
{A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x64.Build.0 = Release|x64 + {A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x86.ActiveCfg = Release|Win32 + {A79C2604-C33D-497C-9770-D34E118B77FE}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {C0A85719-CF8C-4BCD-BDF6-7C57EE651CBB} + EndGlobalSection +EndGlobal