Refactor hotwords, support loading hotwords from file (#296)

Wei Kang
2023-09-14 19:33:17 +08:00
committed by GitHub
parent 087367d7fe
commit 47184f9db7
34 changed files with 803 additions and 300 deletions


@@ -82,7 +82,6 @@ from pathlib import Path
 from typing import List, Tuple

 import numpy as np
-import sentencepiece as spm

 import sherpa_onnx

@@ -98,43 +97,25 @@ def get_args():
     )

     parser.add_argument(
-        "--bpe-model",
+        "--hotwords-file",
         type=str,
         default="",
         help="""
-        Path to bpe.model,
-        Used only when --decoding-method=modified_beam_search
+        The file containing hotwords, one word/phrase per line, and for each
+        phrase the bpe/cjkchar are separated by a space. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
         """,
     )

-    parser.add_argument(
-        "--modeling-unit",
-        type=str,
-        default="char",
-        help="""
-        The type of modeling unit.
-        Valid values are bpe, bpe+char, char.
-        Note: the char here means characters in CJK languages.
-        """,
-    )
-
-    parser.add_argument(
-        "--contexts",
-        type=str,
-        default="",
-        help="""
-        The context list, a string containing words/phrases separated
-        by /, for example: 'HELLO WORLD/I LOVE YOU/GO AWAY'.
-        """,
-    )
-
     parser.add_argument(
-        "--context-score",
+        "--hotwords-score",
         type=float,
         default=1.5,
         help="""
-        The context score of each token for biasing word/phrase. Used only if
-        --contexts is given.
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
         """,
     )
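Taken together, the two new flags replace the old --bpe-model/--modeling-unit/--contexts/--context-score quartet: the hotwords file carries one word/phrase per line, already split into the model's modeling units, and the score sets the per-token bias. As a sketch, a hotwords file might contain:

    ▁HE LL O ▁WORLD
    你 好 世 界

and an invocation with the new flags might look like the following (the script name and the model/file paths are illustrative assumptions, not taken from this commit):

    python3 ./offline-decode-files.py \
      --tokens=./tokens.txt \
      --encoder=./encoder.onnx \
      --decoder=./decoder.onnx \
      --joiner=./joiner.onnx \
      --decoding-method=modified_beam_search \
      --hotwords-file=./hotwords.txt \
      --hotwords-score=1.5 \
      ./test.wav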
@@ -273,25 +254,6 @@ def assert_file_exists(filename: str):
"https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
)
def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
sp = None
if "bpe" in args.modeling_unit:
assert_file_exists(args.bpe_model)
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
tokens = {}
with open(args.tokens, "r", encoding="utf-8") as f:
for line in f:
toks = line.strip().split()
assert len(toks) == 2, len(toks)
assert toks[0] not in tokens, f"Duplicate token: {toks} "
tokens[toks[0]] = int(toks[1])
return sherpa_onnx.encode_contexts(
modeling_unit=args.modeling_unit, contexts=contexts, sp=sp, tokens_table=tokens
)
def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
"""
Args:
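The removed encode_contexts above shows the work that previously fell on the script: load a SentencePiece model when the units are BPE, build a symbol-to-id table from tokens.txt, and convert each phrase to token ids. With --hotwords-file, that conversion happens inside sherpa-onnx instead. A minimal Python sketch of the equivalent lookup, assuming the file format described in the help text above (illustrative only, not code from this commit):

    from typing import Dict, List


    def load_token_table(tokens_path: str) -> Dict[str, int]:
        # tokens.txt maps one "symbol id" pair per line.
        table = {}
        with open(tokens_path, "r", encoding="utf-8") as f:
            for line in f:
                sym, idx = line.strip().split()
                table[sym] = int(idx)
        return table


    def encode_hotwords_file(hotwords_path: str, table: Dict[str, int]) -> List[List[int]]:
        # Each line is already split into modeling units, so encoding
        # reduces to looking every unit up in the token table.
        hotwords = []
        with open(hotwords_path, "r", encoding="utf-8") as f:
            for line in f:
                units = line.strip().split()
                if units:
                    hotwords.append([table[u] for u in units])
        return hotwords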
@@ -322,7 +284,6 @@ def main():
     assert_file_exists(args.tokens)
     assert args.num_threads > 0, args.num_threads

-    contexts_list = []
     if args.encoder:
         assert len(args.paraformer) == 0, args.paraformer
         assert len(args.nemo_ctc) == 0, args.nemo_ctc
@@ -330,11 +291,6 @@ def main():
         assert len(args.whisper_decoder) == 0, args.whisper_decoder
         assert len(args.tdnn_model) == 0, args.tdnn_model

-        contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
-        if contexts:
-            print(f"Contexts list: {contexts}")
-            contexts_list = encode_contexts(args, contexts)
-
         assert_file_exists(args.encoder)
         assert_file_exists(args.decoder)
         assert_file_exists(args.joiner)
@@ -348,7 +304,8 @@ def main():
             sample_rate=args.sample_rate,
             feature_dim=args.feature_dim,
             decoding_method=args.decoding_method,
-            context_score=args.context_score,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
             debug=args.debug,
         )
     elif args.paraformer:
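After this hunk, hotword biasing is configured once when the recognizer is built, not per stream. A minimal sketch of the new construction path (the paths are placeholders; only the keyword arguments visible in the diff are relied on):

    import sherpa_onnx

    recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
        encoder="./encoder.onnx",
        decoder="./decoder.onnx",
        joiner="./joiner.onnx",
        tokens="./tokens.txt",
        decoding_method="modified_beam_search",  # hotwords take effect only with beam search
        hotwords_file="./hotwords.txt",
        hotwords_score=1.5,
    )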
@@ -425,12 +382,7 @@ def main():
         samples, sample_rate = read_wave(wave_filename)
         duration = len(samples) / sample_rate
         total_duration += duration

-        if contexts_list:
-            assert len(args.paraformer) == 0, args.paraformer
-            assert len(args.nemo_ctc) == 0, args.nemo_ctc
-            s = recognizer.create_stream(contexts_list=contexts_list)
-        else:
-            s = recognizer.create_stream()
+        s = recognizer.create_stream()

         s.accept_waveform(sample_rate, samples)
         streams.append(s)
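With the per-stream contexts_list gone, every model type creates streams the same way and the decode loop no longer special-cases transducer models. Continuing the placeholder names from the sketch above (read_wave is the helper defined earlier in this file), decoding one file becomes:

    # Sketch of the simplified per-file flow: no per-stream hotword state.
    samples, sample_rate = read_wave("./test.wav")

    s = recognizer.create_stream()
    s.accept_waveform(sample_rate, samples)
    recognizer.decode_streams([s])
    print(s.result.text)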