Support real time hotwords on python (#230)

* support real time hotwords on python * fix comments
2023-08-03 15:50:11 +08:00
parent 8ab0090909
commit 801693a4d4
1 changed files with 89 additions and 2 deletions
--- a/python-api-examples/speech-recognition-from-microphone.py
+++ b/python-api-examples/speech-recognition-from-microphone.py
@@ -10,6 +10,9 @@ import argparse
 import sys
 from pathlib import Path

+from typing import List, Tuple
+import sentencepiece as spm
+
 try:
    import sounddevice as sd
 except ImportError:
@@ -70,6 +73,59 @@ def get_args():
        help="Valid values are greedy_search and modified_beam_search",
    )

+    parser.add_argument(
+        "--max-active-paths",
+        type=int,
+        default=4,
+        help="""Used only when --decoding-method is modified_beam_search.
+        It specifies number of active paths to keep during decoding.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="",
+        help="""
+        Path to bpe.model, it will be used to tokenize contexts biasing phrases.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--modeling-unit",
+        type=str,
+        default="char",
+        help="""
+        The type of modeling unit, it will be used to tokenize contexts biasing phrases.
+        Valid values are bpe, bpe+char, char.
+        Note: the char here means characters in CJK languages.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--contexts",
+        type=str,
+        default="",
+        help="""
+        The context list, it is a string containing some words/phrases separated
+        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--context-score",
+        type=float,
+        default=1.5,
+        help="""
+        The context score of each token for biasing word/phrase. Used only if
+        --contexts is given.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
    return parser.parse_args()


@@ -91,11 +147,40 @@ def create_recognizer():
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.decoding_method,
+        max_active_paths=args.max_active_paths,
+        context_score=args.context_score,
    )
    return recognizer

+def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
+    sp = None
+    if "bpe" in args.modeling_unit:
+        assert_file_exists(args.bpe_model)
+        sp = spm.SentencePieceProcessor()
+        sp.load(args.bpe_model)
+    tokens = {}
+    with open(args.tokens, "r", encoding="utf-8") as f:
+        for line in f:
+            toks = line.strip().split()
+            assert len(toks) == 2, len(toks)
+            assert toks[0] not in tokens, f"Duplicate token: {toks} "
+            tokens[toks[0]] = int(toks[1])
+    return sherpa_onnx.encode_contexts(
+        modeling_unit=args.modeling_unit,
+        contexts=contexts,
+        sp=sp,
+        tokens_table=tokens,
+    )

 def main():
+    args = get_args()
+
+    contexts_list = []
+    contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
+    if contexts:
+        print(f"Contexts list: {contexts}")
+        contexts_list = encode_contexts(args, contexts)
+
    recognizer = create_recognizer()
    print("Started! Please speak")

@@ -104,7 +189,10 @@ def main():
    sample_rate = 48000
    samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
    last_result = ""
-    stream = recognizer.create_stream()
+    if contexts_list:
+        stream = recognizer.create_stream(contexts_list=contexts_list)
+    else:
+        stream = recognizer.create_stream()
    with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
        while True:
            samples, _ = s.read(samples_per_read)  # a blocking read
@@ -117,7 +205,6 @@ def main():
                last_result = result
                print("\r{}".format(result), end="", flush=True)

-
 if __name__ == "__main__":
    devices = sd.query_devices()
    print(devices)