Support streaming paraformer (#263)
This commit is contained in:
@@ -37,14 +37,14 @@ python3 ./python-api-examples/non_streaming_server.py \
|
||||
(2) Use a non-streaming paraformer
|
||||
|
||||
cd /path/to/sherpa-onnx
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28
|
||||
cd sherpa-onnx-paraformer-zh-2023-03-28
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-bilingual-zh-en
|
||||
cd sherpa-onnx-paraformer-bilingual-zh-en/
|
||||
git lfs pull --include "*.onnx"
|
||||
cd ..
|
||||
|
||||
python3 ./python-api-examples/non_streaming_server.py \
|
||||
--paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \
|
||||
--tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt
|
||||
--paraformer ./sherpa-onnx-paraformer-bilingual-zh-en/model.int8.onnx \
|
||||
--tokens ./sherpa-onnx-paraformer-bilingual-zh-en/tokens.txt
|
||||
|
||||
(3) Use a non-streaming CTC model from NeMo
|
||||
|
||||
|
||||
@@ -5,16 +5,41 @@ This file demonstrates how to use sherpa-onnx Python API to transcribe
|
||||
file(s) with a streaming model.
|
||||
|
||||
Usage:
|
||||
./online-decode-files.py \
|
||||
/path/to/foo.wav \
|
||||
/path/to/bar.wav \
|
||||
/path/to/16kHz.wav \
|
||||
/path/to/8kHz.wav
|
||||
|
||||
(1) Streaming transducer
|
||||
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-06-26
|
||||
cd sherpa-onnx-streaming-zipformer-en-2023-06-26
|
||||
git lfs pull --include "*.onnx"
|
||||
|
||||
./python-api-examples/online-decode-files.py \
|
||||
--tokens=./sherpa-onnx-streaming-zipformer-en-2023-06-26/tokens.txt \
|
||||
--encoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/encoder-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||
--decoder=./sherpa-onnx-streaming-zipformer-en-2023-06-26/decoder-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||
--joiner=./sherpa-onnx-streaming-zipformer-en-2023-06-26/joiner-epoch-99-avg-1-chunk-16-left-64.onnx \
|
||||
./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/0.wav \
|
||||
./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/1.wav \
|
||||
./sherpa-onnx-streaming-zipformer-en-2023-06-26/test_wavs/8k.wav
|
||||
|
||||
(2) Streaming paraformer
|
||||
|
||||
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-paraformer-bilingual-zh-en
|
||||
cd sherpa-onnx-streaming-paraformer-bilingual-zh-en
|
||||
git lfs pull --include "*.onnx"
|
||||
|
||||
./python-api-examples/online-decode-files.py \
|
||||
--tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \
|
||||
--paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx \
|
||||
--paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx \
|
||||
./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav \
|
||||
./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/1.wav \
|
||||
./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/2.wav \
|
||||
./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/3.wav \
|
||||
./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/8k.wav
|
||||
|
||||
Please refer to
|
||||
https://k2-fsa.github.io/sherpa/onnx/index.html
|
||||
to install sherpa-onnx and to download the pre-trained models
|
||||
used in this file.
|
||||
to install sherpa-onnx and to download streaming pre-trained models.
|
||||
"""
|
||||
import argparse
|
||||
import time
|
||||
@@ -41,19 +66,31 @@ def get_args():
|
||||
parser.add_argument(
|
||||
"--encoder",
|
||||
type=str,
|
||||
help="Path to the encoder model",
|
||||
help="Path to the transducer encoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--decoder",
|
||||
type=str,
|
||||
help="Path to the decoder model",
|
||||
help="Path to the transducer decoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--joiner",
|
||||
type=str,
|
||||
help="Path to the joiner model",
|
||||
help="Path to the transducer joiner model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--paraformer-encoder",
|
||||
type=str,
|
||||
help="Path to the paraformer encoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--paraformer-decoder",
|
||||
type=str,
|
||||
help="Path to the paraformer decoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -200,24 +237,42 @@ def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
assert_file_exists(args.encoder)
|
||||
assert_file_exists(args.decoder)
|
||||
assert_file_exists(args.joiner)
|
||||
assert_file_exists(args.tokens)
|
||||
|
||||
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
|
||||
tokens=args.tokens,
|
||||
encoder=args.encoder,
|
||||
decoder=args.decoder,
|
||||
joiner=args.joiner,
|
||||
num_threads=args.num_threads,
|
||||
provider=args.provider,
|
||||
sample_rate=16000,
|
||||
feature_dim=80,
|
||||
decoding_method=args.decoding_method,
|
||||
max_active_paths=args.max_active_paths,
|
||||
context_score=args.context_score,
|
||||
)
|
||||
if args.encoder:
|
||||
assert_file_exists(args.encoder)
|
||||
assert_file_exists(args.decoder)
|
||||
assert_file_exists(args.joiner)
|
||||
|
||||
assert not args.paraformer_encoder, args.paraformer_encoder
|
||||
assert not args.paraformer_decoder, args.paraformer_decoder
|
||||
|
||||
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
|
||||
tokens=args.tokens,
|
||||
encoder=args.encoder,
|
||||
decoder=args.decoder,
|
||||
joiner=args.joiner,
|
||||
num_threads=args.num_threads,
|
||||
provider=args.provider,
|
||||
sample_rate=16000,
|
||||
feature_dim=80,
|
||||
decoding_method=args.decoding_method,
|
||||
max_active_paths=args.max_active_paths,
|
||||
context_score=args.context_score,
|
||||
)
|
||||
elif args.paraformer_encoder:
|
||||
recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
|
||||
tokens=args.tokens,
|
||||
encoder=args.paraformer_encoder,
|
||||
decoder=args.paraformer_decoder,
|
||||
num_threads=args.num_threads,
|
||||
provider=args.provider,
|
||||
sample_rate=16000,
|
||||
feature_dim=80,
|
||||
decoding_method="greedy_search",
|
||||
)
|
||||
else:
|
||||
raise ValueError("Please provide a model")
|
||||
|
||||
print("Started!")
|
||||
start_time = time.time()
|
||||
@@ -243,7 +298,7 @@ def main():
|
||||
|
||||
s.accept_waveform(sample_rate, samples)
|
||||
|
||||
tail_paddings = np.zeros(int(0.2 * sample_rate), dtype=np.float32)
|
||||
tail_paddings = np.zeros(int(0.66 * sample_rate), dtype=np.float32)
|
||||
s.accept_waveform(sample_rate, tail_paddings)
|
||||
|
||||
s.input_finished()
|
||||
|
||||
@@ -16,9 +16,9 @@ Example:
|
||||
(1) Without a certificate
|
||||
|
||||
python3 ./python-api-examples/streaming_server.py \
|
||||
--encoder-model ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
|
||||
--encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt
|
||||
|
||||
(2) With a certificate
|
||||
@@ -32,9 +32,9 @@ python3 ./python-api-examples/streaming_server.py \
|
||||
(b) Start the server
|
||||
|
||||
python3 ./python-api-examples/streaming_server.py \
|
||||
--encoder-model ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder-model ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner-model ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
|
||||
--encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx \
|
||||
--decoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \
|
||||
--joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx \
|
||||
--tokens ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \
|
||||
--certificate ./python-api-examples/web/cert.pem
|
||||
|
||||
@@ -113,24 +113,33 @@ def setup_logger(
|
||||
|
||||
def add_model_args(parser: argparse.ArgumentParser):
|
||||
parser.add_argument(
|
||||
"--encoder-model",
|
||||
"--encoder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the encoder model",
|
||||
help="Path to the transducer encoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--decoder-model",
|
||||
"--decoder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the decoder model.",
|
||||
help="Path to the transducer decoder model.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--joiner-model",
|
||||
"--joiner",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to the joiner model.",
|
||||
help="Path to the transducer joiner model.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--paraformer-encoder",
|
||||
type=str,
|
||||
help="Path to the paraformer encoder model",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--paraformer-decoder",
|
||||
type=str,
|
||||
    help="Path to the paraformer decoder model.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -323,22 +332,40 @@ def get_args():
|
||||
|
||||
|
||||
def create_recognizer(args) -> sherpa_onnx.OnlineRecognizer:
|
||||
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
|
||||
tokens=args.tokens,
|
||||
encoder=args.encoder_model,
|
||||
decoder=args.decoder_model,
|
||||
joiner=args.joiner_model,
|
||||
num_threads=args.num_threads,
|
||||
sample_rate=args.sample_rate,
|
||||
feature_dim=args.feat_dim,
|
||||
decoding_method=args.decoding_method,
|
||||
max_active_paths=args.num_active_paths,
|
||||
enable_endpoint_detection=args.use_endpoint != 0,
|
||||
rule1_min_trailing_silence=args.rule1_min_trailing_silence,
|
||||
rule2_min_trailing_silence=args.rule2_min_trailing_silence,
|
||||
rule3_min_utterance_length=args.rule3_min_utterance_length,
|
||||
provider=args.provider,
|
||||
)
|
||||
if args.encoder:
|
||||
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
|
||||
tokens=args.tokens,
|
||||
encoder=args.encoder,
|
||||
decoder=args.decoder,
|
||||
joiner=args.joiner,
|
||||
num_threads=args.num_threads,
|
||||
sample_rate=args.sample_rate,
|
||||
feature_dim=args.feat_dim,
|
||||
decoding_method=args.decoding_method,
|
||||
max_active_paths=args.num_active_paths,
|
||||
enable_endpoint_detection=args.use_endpoint != 0,
|
||||
rule1_min_trailing_silence=args.rule1_min_trailing_silence,
|
||||
rule2_min_trailing_silence=args.rule2_min_trailing_silence,
|
||||
rule3_min_utterance_length=args.rule3_min_utterance_length,
|
||||
provider=args.provider,
|
||||
)
|
||||
elif args.paraformer_encoder:
|
||||
recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
|
||||
tokens=args.tokens,
|
||||
encoder=args.paraformer_encoder,
|
||||
decoder=args.paraformer_decoder,
|
||||
num_threads=args.num_threads,
|
||||
sample_rate=args.sample_rate,
|
||||
feature_dim=args.feat_dim,
|
||||
decoding_method=args.decoding_method,
|
||||
enable_endpoint_detection=args.use_endpoint != 0,
|
||||
rule1_min_trailing_silence=args.rule1_min_trailing_silence,
|
||||
rule2_min_trailing_silence=args.rule2_min_trailing_silence,
|
||||
rule3_min_utterance_length=args.rule3_min_utterance_length,
|
||||
provider=args.provider,
|
||||
)
|
||||
else:
|
||||
raise ValueError("Please provide a model")
|
||||
|
||||
return recognizer
|
||||
|
||||
@@ -654,11 +681,25 @@ Go back to <a href="/streaming_record.html">/streaming_record.html</a>
|
||||
|
||||
|
||||
def check_args(args):
|
||||
assert Path(args.encoder_model).is_file(), f"{args.encoder_model} does not exist"
|
||||
if args.encoder:
|
||||
assert Path(args.encoder).is_file(), f"{args.encoder} does not exist"
|
||||
|
||||
assert Path(args.decoder_model).is_file(), f"{args.decoder_model} does not exist"
|
||||
assert Path(args.decoder).is_file(), f"{args.decoder} does not exist"
|
||||
|
||||
assert Path(args.joiner_model).is_file(), f"{args.joiner_model} does not exist"
|
||||
assert Path(args.joiner).is_file(), f"{args.joiner} does not exist"
|
||||
|
||||
assert args.paraformer_encoder is None, args.paraformer_encoder
|
||||
assert args.paraformer_decoder is None, args.paraformer_decoder
|
||||
elif args.paraformer_encoder:
|
||||
assert Path(
|
||||
args.paraformer_encoder
|
||||
).is_file(), f"{args.paraformer_encoder} does not exist"
|
||||
|
||||
assert Path(
|
||||
args.paraformer_decoder
|
||||
).is_file(), f"{args.paraformer_decoder} does not exist"
|
||||
else:
|
||||
raise ValueError("Please provide a model")
|
||||
|
||||
if not Path(args.tokens).is_file():
|
||||
raise ValueError(f"{args.tokens} does not exist")
|
||||
|
||||
Reference in New Issue
Block a user