Support contextual-biasing for streaming model (#184)

* Support contextual-biasing for streaming model * The whole pipeline runs normally * Fix comments
2023-06-30 16:46:24 +08:00
parent b2e0c4c9c2
commit 513dfaa552
10 changed files with 238 additions and 22 deletions
--- a/python-api-examples/online-decode-files.py
+++ b/python-api-examples/online-decode-files.py
@@ -20,9 +20,10 @@ import argparse
 import time
 import wave
 from pathlib import Path
-from typing import Tuple
+from typing import List, Tuple

 import numpy as np
+import sentencepiece as spm
 import sherpa_onnx


@@ -69,6 +70,59 @@ def get_args():
        help="Valid values are greedy_search and modified_beam_search",
    )

+    parser.add_argument(
+        "--max-active-paths",
+        type=int,
+        default=4,
+        help="""Used only when --decoding-method is modified_beam_search.
+        It specifies number of active paths to keep during decoding.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="",
+        help="""
+        Path to bpe.model, it will be used to tokenize contexts biasing phrases.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--modeling-unit",
+        type=str,
+        default="char",
+        help="""
+        The type of modeling unit, it will be used to tokenize contexts biasing phrases.
+        Valid values are bpe, bpe+char, char.
+        Note: the char here means characters in CJK languages.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--contexts",
+        type=str,
+        default="",
+        help="""
+        The context list, it is a string containing some words/phrases separated
+        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY'.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--context-score",
+        type=float,
+        default=1.5,
+        help="""
+        The context score of each token for biasing word/phrase. Used only if
+        --contexts is given.
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
    parser.add_argument(
        "sound_files",
        type=str,
@@ -116,6 +170,27 @@ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
        return samples_float32, f.getframerate()


+def encode_contexts(args, contexts: List[str]) -> List[List[int]]:  # Turn biasing phrases into token-ID lists via sherpa_onnx.encode_contexts.
+    sp = None  # stays None unless the modeling unit involves BPE
+    if "bpe" in args.modeling_unit:  # substring test: matches both "bpe" and "bpe+char"
+        assert_file_exists(args.bpe_model)  # helper not shown in this hunk
+        sp = spm.SentencePieceProcessor()
+        sp.load(args.bpe_model)
+    tokens = {}  # token symbol -> integer ID, filled from the args.tokens file
+    with open(args.tokens, "r", encoding="utf-8") as f:
+        for line in f:
+            toks = line.strip().split()  # each line is expected to be "<symbol> <id>"
+            assert len(toks) == 2, len(toks)  # blank or malformed lines fail here; NOTE(review): asserts are skipped under python -O
+            assert toks[0] not in tokens, f"Duplicate token: {toks} "  # each symbol may be listed only once
+            tokens[toks[0]] = int(toks[1])  # int() raises ValueError if the ID is not numeric
+    return sherpa_onnx.encode_contexts(  # result is returned to the caller unchanged
+        modeling_unit=args.modeling_unit,
+        contexts=contexts,
+        sp=sp,  # None when no BPE model was loaded above
+        tokens_table=tokens,
+    )
+
+
 def main():
    args = get_args()
    assert_file_exists(args.encoder)
@@ -132,11 +207,20 @@ def main():
        sample_rate=16000,
        feature_dim=80,
        decoding_method=args.decoding_method,
+        max_active_paths=args.max_active_paths,
+        context_score=args.context_score,
    )

    print("Started!")
    start_time = time.time()

+    contexts_list = []
+    contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
+    if contexts:
+        print(f"Contexts list: {contexts}")
+        contexts_list = encode_contexts(args, contexts)
+
+
    streams = []
    total_duration = 0
    for wave_filename in args.sound_files:
@@ -145,7 +229,11 @@ def main():
        duration = len(samples) / sample_rate
        total_duration += duration

-        s = recognizer.create_stream()
+        if contexts_list:
+            s = recognizer.create_stream(contexts_list=contexts_list)
+        else:
+            s = recognizer.create_stream()
+
        s.accept_waveform(sample_rate, samples)

        tail_paddings = np.zeros(int(0.2 * sample_rate), dtype=np.float32)