Implement context biasing with a Aho Corasick automata (#145)
* Implement context graph * Modify the interface to support context biasing * Support context biasing in modified beam search; add python wrapper * Support context biasing in python api example * Minor fixes * Fix context graph * Minor fixes * Fix tests * Fix style * Fix style * Fix comments * Minor fixes * Add missing header * Replace std::shared_ptr with std::unique_ptr for effciency * Build graph in constructor * Fix comments * Minor fixes * Fix docs
This commit is contained in:
@@ -43,9 +43,10 @@ import argparse
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import sentencepiece as spm
|
||||
import sherpa_onnx
|
||||
|
||||
|
||||
@@ -60,6 +61,47 @@ def get_args():
|
||||
help="Path to tokens.txt",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--bpe-model",
|
||||
type=str,
|
||||
default="",
|
||||
help="""
|
||||
Path to bpe.model,
|
||||
Used only when --decoding-method=modified_beam_search
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--modeling-unit",
|
||||
type=str,
|
||||
default="char",
|
||||
help="""
|
||||
The type of modeling unit.
|
||||
Valid values are bpe, bpe+char, char.
|
||||
Note: the char here means characters in CJK languages.
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--contexts",
|
||||
type=str,
|
||||
default="",
|
||||
help="""
|
||||
The context list, it is a string containing some words/phrases separated
|
||||
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--context-score",
|
||||
type=float,
|
||||
default=1.5,
|
||||
help="""
|
||||
The context score of each token for biasing word/phrase. Used only if
|
||||
--contexts is given.
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--encoder",
|
||||
default="",
|
||||
@@ -153,6 +195,24 @@ def assert_file_exists(filename: str):
|
||||
)
|
||||
|
||||
|
||||
def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
|
||||
sp = None
|
||||
if "bpe" in args.modeling_unit:
|
||||
assert_file_exists(args.bpe_model)
|
||||
sp = spm.SentencePieceProcessor()
|
||||
sp.load(args.bpe_model)
|
||||
tokens = {}
|
||||
with open(args.tokens, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
toks = line.strip().split()
|
||||
assert len(toks) == 2, len(toks)
|
||||
assert toks[0] not in tokens, f"Duplicate token: {toks} "
|
||||
tokens[toks[0]] = int(toks[1])
|
||||
return sherpa_onnx.encode_contexts(
|
||||
modeling_unit=args.modeling_unit, contexts=contexts, sp=sp, tokens_table=tokens
|
||||
)
|
||||
|
||||
|
||||
def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
|
||||
"""
|
||||
Args:
|
||||
@@ -182,10 +242,17 @@ def main():
|
||||
args = get_args()
|
||||
assert_file_exists(args.tokens)
|
||||
assert args.num_threads > 0, args.num_threads
|
||||
|
||||
contexts_list = []
|
||||
if args.encoder:
|
||||
assert len(args.paraformer) == 0, args.paraformer
|
||||
assert len(args.nemo_ctc) == 0, args.nemo_ctc
|
||||
|
||||
contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
|
||||
if contexts:
|
||||
print(f"Contexts list: {contexts}")
|
||||
contexts_list = encode_contexts(args, contexts)
|
||||
|
||||
assert_file_exists(args.encoder)
|
||||
assert_file_exists(args.decoder)
|
||||
assert_file_exists(args.joiner)
|
||||
@@ -199,6 +266,7 @@ def main():
|
||||
sample_rate=args.sample_rate,
|
||||
feature_dim=args.feature_dim,
|
||||
decoding_method=args.decoding_method,
|
||||
context_score=args.context_score,
|
||||
debug=args.debug,
|
||||
)
|
||||
elif args.paraformer:
|
||||
@@ -238,8 +306,12 @@ def main():
|
||||
samples, sample_rate = read_wave(wave_filename)
|
||||
duration = len(samples) / sample_rate
|
||||
total_duration += duration
|
||||
|
||||
s = recognizer.create_stream()
|
||||
if contexts_list:
|
||||
assert len(args.paraformer) == 0, args.paraformer
|
||||
assert len(args.nemo_ctc) == 0, args.nemo_ctc
|
||||
s = recognizer.create_stream(contexts_list=contexts_list)
|
||||
else:
|
||||
s = recognizer.create_stream()
|
||||
s.accept_waveform(sample_rate, samples)
|
||||
|
||||
streams.append(s)
|
||||
|
||||
Reference in New Issue
Block a user