diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index bdfd6d37..87865464 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -20,6 +20,38 @@ tar xvf $name rm $name ls -lh $repo python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py + +if [[ $(uname) == Linux ]]; then + # It needs ffmpeg + log "generate subtitles (Chinese)" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav + + python3 ./python-api-examples/generate-subtitles.py \ + --silero-vad-model=./silero_vad.onnx \ + --sense-voice=$repo/model.onnx \ + --tokens=$repo/tokens.txt \ + --num-threads=2 \ + ./lei-jun-test.wav + + cat lei-jun-test.srt + + rm lei-jun-test.wav + + log "generate subtitles (English)" + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav + + python3 ./python-api-examples/generate-subtitles.py \ + --silero-vad-model=./silero_vad.onnx \ + --sense-voice=$repo/model.onnx \ + --tokens=$repo/tokens.txt \ + --num-threads=2 \ + ./Obama.wav + + cat Obama.srt + rm Obama.wav + rm silero_vad.onnx +fi rm -rf $repo log "test offline TeleSpeech CTC" diff --git a/.github/workflows/run-python-test.yaml b/.github/workflows/run-python-test.yaml index c7433e0e..e0e88dc8 100644 --- a/.github/workflows/run-python-test.yaml +++ b/.github/workflows/run-python-test.yaml @@ -79,6 +79,11 @@ jobs: python3 -m pip install --upgrade pip numpy pypinyin sentencepiece>=0.1.96 soundfile python3 -m pip install wheel twine setuptools + - name: Install ffmpeg + shell: bash + run: | + sudo apt-get install ffmpeg + - name: Install ninja shell: bash run: | diff --git a/python-api-examples/generate-subtitles.py b/python-api-examples/generate-subtitles.py index 6488fce9..b39499e9 100755 --- a/python-api-examples/generate-subtitles.py +++ b/python-api-examples/generate-subtitles.py @@ -12,12 +12,12 @@ Supported file formats are those supported by ffmpeg; for instance, Note that you need a non-streaming model for this script. Please visit -https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx +https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx to download silero_vad.onnx For instance, -wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx (1) For paraformer @@ -58,7 +58,17 @@ wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/siler --num-threads=2 \ /path/to/test.mp4 -(4) For WeNet CTC models +(4) For SenseVoice CTC models + +./python-api-examples/generate-subtitles.py \ + --silero-vad-model=/path/to/silero_vad.onnx \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --num-threads=2 \ + /path/to/test.mp4 + + +(5) For WeNet CTC models ./python-api-examples/generate-subtitles.py \ --silero-vad-model=/path/to/silero_vad.onnx \ @@ -130,6 +140,13 @@ def get_args(): help="Path to the model.onnx from Paraformer", ) + parser.add_argument( + "--sense-voice", + default="", + type=str, + help="Path to the model.onnx from SenseVoice", + ) + parser.add_argument( "--wenet-ctc", default="", @@ -242,6 +259,7 @@ def assert_file_exists(filename: str): def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: if args.encoder: assert len(args.paraformer) == 0, args.paraformer + assert len(args.sense_voice) == 0, args.sense_voice assert len(args.wenet_ctc) == 0, args.wenet_ctc assert len(args.whisper_encoder) == 0, args.whisper_encoder assert len(args.whisper_decoder) == 0, args.whisper_decoder @@ -262,6 +280,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: debug=args.debug, ) elif args.paraformer: + assert len(args.sense_voice) == 0, args.sense_voice assert len(args.wenet_ctc) == 0, args.wenet_ctc assert len(args.whisper_encoder) == 0, args.whisper_encoder assert len(args.whisper_decoder) == 0, args.whisper_decoder @@ -277,6 +296,19 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: decoding_method=args.decoding_method, debug=args.debug, ) + elif args.sense_voice: + assert len(args.wenet_ctc) == 0, args.wenet_ctc + assert len(args.whisper_encoder) == 0, args.whisper_encoder + assert len(args.whisper_decoder) == 0, args.whisper_decoder + + assert_file_exists(args.sense_voice) + recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice( + model=args.sense_voice, + tokens=args.tokens, + num_threads=args.num_threads, + use_itn=True, + debug=args.debug, + ) elif args.wenet_ctc: assert len(args.whisper_encoder) == 0, args.whisper_encoder assert len(args.whisper_decoder) == 0, args.whisper_decoder @@ -406,6 +438,9 @@ def main(): vad.accept_waveform(buffer[:window_size]) buffer = buffer[window_size:] + if is_silence: + vad.flush() + streams = [] segments = [] while not vad.empty(): diff --git a/python-api-examples/non_streaming_server.py b/python-api-examples/non_streaming_server.py index d82c5c52..2194d6f5 100755 --- a/python-api-examples/non_streaming_server.py +++ b/python-api-examples/non_streaming_server.py @@ -92,6 +92,16 @@ python3 ./python-api-examples/non_streaming_server.py \ --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \ --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt +(6) Use a Non-streaming SenseVoice model + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + +python3 ./python-api-examples/non_streaming_server.py \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt + ---- To use a certificate so that you can use https, please use @@ -208,6 +218,15 @@ def add_paraformer_model_args(parser: argparse.ArgumentParser): ) +def add_sense_voice_model_args(parser: argparse.ArgumentParser): + parser.add_argument( + "--sense-voice", + default="", + type=str, + help="Path to the model.onnx from SenseVoice", + ) + + def add_nemo_ctc_model_args(parser: argparse.ArgumentParser): parser.add_argument( "--nemo-ctc", @@ -287,6 +306,7 @@ def add_whisper_model_args(parser: argparse.ArgumentParser): def add_model_args(parser: argparse.ArgumentParser): add_transducer_model_args(parser) add_paraformer_model_args(parser) + add_sense_voice_model_args(parser) add_nemo_ctc_model_args(parser) add_wenet_ctc_model_args(parser) add_tdnn_ctc_model_args(parser) @@ -850,6 +870,7 @@ def assert_file_exists(filename: str): def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: if args.encoder: assert len(args.paraformer) == 0, args.paraformer + assert len(args.sense_voice) == 0, args.sense_voice assert len(args.nemo_ctc) == 0, args.nemo_ctc assert len(args.wenet_ctc) == 0, args.wenet_ctc assert len(args.whisper_encoder) == 0, args.whisper_encoder @@ -876,6 +897,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: provider=args.provider, ) elif args.paraformer: + assert len(args.sense_voice) == 0, args.sense_voice assert len(args.nemo_ctc) == 0, args.nemo_ctc assert len(args.wenet_ctc) == 0, args.wenet_ctc assert len(args.whisper_encoder) == 0, args.whisper_encoder @@ -893,6 +915,20 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: decoding_method=args.decoding_method, provider=args.provider, ) + elif args.sense_voice: + assert len(args.nemo_ctc) == 0, args.nemo_ctc + assert len(args.wenet_ctc) == 0, args.wenet_ctc + assert len(args.whisper_encoder) == 0, args.whisper_encoder + assert len(args.whisper_decoder) == 0, args.whisper_decoder + assert len(args.tdnn_model) == 0, args.tdnn_model + + assert_file_exists(args.sense_voice) + recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice( + model=args.sense_voice, + tokens=args.tokens, + num_threads=args.num_threads, + use_itn=True, + ) elif args.nemo_ctc: assert len(args.wenet_ctc) == 0, args.wenet_ctc assert len(args.whisper_encoder) == 0, args.whisper_encoder diff --git a/python-api-examples/offline-sense-voice-ctc-decode-files.py b/python-api-examples/offline-sense-voice-ctc-decode-files.py index b406288f..b6f55b47 100644 --- a/python-api-examples/offline-sense-voice-ctc-decode-files.py +++ b/python-api-examples/offline-sense-voice-ctc-decode-files.py @@ -22,7 +22,7 @@ import soundfile as sf def create_recognizer(): - model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx" + model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx" tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt" test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav" # test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav" diff --git a/python-api-examples/vad-with-non-streaming-asr.py b/python-api-examples/vad-with-non-streaming-asr.py index 0a22b9b8..7bb125d1 100755 --- a/python-api-examples/vad-with-non-streaming-asr.py +++ b/python-api-examples/vad-with-non-streaming-asr.py @@ -45,6 +45,14 @@ Note that you need a non-streaming model for this script. --whisper-task=transcribe \ --num-threads=2 +(4) For SenseVoice CTC models + +./python-api-examples/vad-with-non-streaming-asr.py \ + --silero-vad-model=/path/to/silero_vad.onnx \ + --sense-voice=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx \ + --tokens=./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt \ + --num-threads=2 + Please refer to https://k2-fsa.github.io/sherpa/onnx/index.html to install sherpa-onnx and to download non-streaming pre-trained models @@ -123,6 +131,13 @@ def get_args(): help="Path to the model.onnx from Paraformer", ) + parser.add_argument( + "--sense-voice", + default="", + type=str, + help="Path to the model.onnx from SenseVoice", + ) + parser.add_argument( "--num-threads", type=int, @@ -233,6 +248,7 @@ def assert_file_exists(filename: str): def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: if args.encoder: assert len(args.paraformer) == 0, args.paraformer + assert len(args.sense_voice) == 0, args.sense_voice assert len(args.whisper_encoder) == 0, args.whisper_encoder assert len(args.whisper_decoder) == 0, args.whisper_decoder @@ -253,6 +269,7 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: debug=args.debug, ) elif args.paraformer: + assert len(args.sense_voice) == 0, args.sense_voice assert len(args.whisper_encoder) == 0, args.whisper_encoder assert len(args.whisper_decoder) == 0, args.whisper_decoder @@ -267,6 +284,18 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer: decoding_method=args.decoding_method, debug=args.debug, ) + elif args.sense_voice: + assert len(args.whisper_encoder) == 0, args.whisper_encoder + assert len(args.whisper_decoder) == 0, args.whisper_decoder + + assert_file_exists(args.sense_voice) + recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice( + model=args.sense_voice, + tokens=args.tokens, + num_threads=args.num_threads, + use_itn=True, + debug=args.debug, + ) elif args.whisper_encoder: assert_file_exists(args.whisper_encoder) assert_file_exists(args.whisper_decoder)