Implement context biasing with a Aho Corasick automata (#145)

* Implement context graph * Modify the interface to support context biasing * Support context biasing in modified beam search; add python wrapper * Support context biasing in python api example * Minor fixes * Fix context graph * Minor fixes * Fix tests * Fix style * Fix style * Fix comments * Minor fixes * Add missing header * Replace std::shared_ptr with std::unique_ptr for effciency * Build graph in constructor * Fix comments * Minor fixes * Fix docs
2023-06-16 14:26:36 +08:00
parent 1a1b9fd236
commit 8562711252
23 changed files with 515 additions and 29 deletions
--- a/python-api-examples/offline-decode-files.py
+++ b/python-api-examples/offline-decode-files.py
@@ -43,9 +43,10 @@ import argparse
 import time
 import wave
 from pathlib import Path
-from typing import Tuple
+from typing import List, Tuple

 import numpy as np
+import sentencepiece as spm
 import sherpa_onnx


@@ -60,6 +61,47 @@ def get_args():
        help="Path to tokens.txt",
    )

+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="",
+        help="""
+        Path to bpe.model,
+        Used only when --decoding-method=modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--modeling-unit",
+        type=str,
+        default="char",
+        help="""
+        The type of modeling unit.
+        Valid values are bpe, bpe+char, char.
+        Note: the char here means characters in CJK languages.
+        """,
+    )
+
+    parser.add_argument(
+        "--contexts",
+        type=str,
+        default="",
+        help="""
+        The context list, it is a string containing some words/phrases separated
+        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
+        """,
+    )
+
+    parser.add_argument(
+        "--context-score",
+        type=float,
+        default=1.5,
+        help="""
+        The context score of each token for biasing word/phrase. Used only if
+        --contexts is given.
+        """,
+    )
+
    parser.add_argument(
        "--encoder",
        default="",
@@ -153,6 +195,24 @@ def assert_file_exists(filename: str):
    )


+def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
+    sp = None
+    if "bpe" in args.modeling_unit:
+        assert_file_exists(args.bpe_model)
+        sp = spm.SentencePieceProcessor()
+        sp.load(args.bpe_model)
+    tokens = {}
+    with open(args.tokens, "r", encoding="utf-8") as f:
+        for line in f:
+            toks = line.strip().split()
+            assert len(toks) == 2, len(toks)
+            assert toks[0] not in tokens, f"Duplicate token: {toks} "
+            tokens[toks[0]] = int(toks[1])
+    return sherpa_onnx.encode_contexts(
+        modeling_unit=args.modeling_unit, contexts=contexts, sp=sp, tokens_table=tokens
+    )
+
+
 def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
    """
    Args:
@@ -182,10 +242,17 @@ def main():
    args = get_args()
    assert_file_exists(args.tokens)
    assert args.num_threads > 0, args.num_threads
+
+    contexts_list = []
    if args.encoder:
        assert len(args.paraformer) == 0, args.paraformer
        assert len(args.nemo_ctc) == 0, args.nemo_ctc

+        contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
+        if contexts:
+            print(f"Contexts list: {contexts}")
+            contexts_list = encode_contexts(args, contexts)
+
        assert_file_exists(args.encoder)
        assert_file_exists(args.decoder)
        assert_file_exists(args.joiner)
@@ -199,6 +266,7 @@ def main():
            sample_rate=args.sample_rate,
            feature_dim=args.feature_dim,
            decoding_method=args.decoding_method,
+            context_score=args.context_score,
            debug=args.debug,
        )
    elif args.paraformer:
@@ -238,8 +306,12 @@ def main():
        samples, sample_rate = read_wave(wave_filename)
        duration = len(samples) / sample_rate
        total_duration += duration
-
-        s = recognizer.create_stream()
+        if contexts_list:
+            assert len(args.paraformer) == 0, args.paraformer
+            assert len(args.nemo_ctc) == 0, args.nemo_ctc
+            s = recognizer.create_stream(contexts_list=contexts_list)
+        else:
+            s = recognizer.create_stream()
        s.accept_waveform(sample_rate, samples)

        streams.append(s)