Refactor hotwords, support loading hotwords from file (#296)

This commit is contained in:
Wei Kang
2023-09-14 19:33:17 +08:00
committed by GitHub
parent 087367d7fe
commit 47184f9db7
34 changed files with 803 additions and 300 deletions

View File

@@ -48,7 +48,6 @@ from pathlib import Path
from typing import List, Tuple
import numpy as np
import sentencepiece as spm
import sherpa_onnx
@@ -124,46 +123,25 @@ def get_args():
)
parser.add_argument(
"--bpe-model",
"--hotwords-file",
type=str,
default="",
help="""
Path to bpe.model, it will be used to tokenize contexts biasing phrases.
Used only when --decoding-method=modified_beam_search
The file containing hotwords, one words/phrases per line, and for each
phrase the bpe/cjkchar are separated by a space. For example:
▁HE LL O ▁WORLD
你 好 世 界
""",
)
parser.add_argument(
"--modeling-unit",
type=str,
default="char",
help="""
The type of modeling unit, it will be used to tokenize contexts biasing phrases.
Valid values are bpe, bpe+char, char.
Note: the char here means characters in CJK languages.
Used only when --decoding-method=modified_beam_search
""",
)
parser.add_argument(
"--contexts",
type=str,
default="",
help="""
The context list, it is a string containing some words/phrases separated
with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
Used only when --decoding-method=modified_beam_search
""",
)
parser.add_argument(
"--context-score",
"--hotwords-score",
type=float,
default=1.5,
help="""
The context score of each token for biasing word/phrase. Used only if
--contexts is given.
Used only when --decoding-method=modified_beam_search
The hotword score of each token for biasing word/phrase. Used only if
--hotwords-file is given.
""",
)
@@ -214,27 +192,6 @@ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
return samples_float32, f.getframerate()
def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
    """Encode biasing phrases by delegating to ``sherpa_onnx.encode_contexts``.

    Args:
      args:
        Parsed command-line arguments. This function reads
        ``args.modeling_unit``, ``args.tokens`` and, when the modeling unit
        contains "bpe", ``args.bpe_model``.
      contexts:
        The words/phrases to encode.

    Returns:
      Whatever ``sherpa_onnx.encode_contexts`` returns for the given phrases.
    """
    # A SentencePiece model is loaded only for modeling units that mention
    # "bpe"; otherwise no processor is handed to the encoder.
    if "bpe" in args.modeling_unit:
        assert_file_exists(args.bpe_model)
        bpe_processor = spm.SentencePieceProcessor()
        bpe_processor.load(args.bpe_model)
    else:
        bpe_processor = None

    # Build the symbol -> id table; every line must hold exactly two
    # whitespace-separated fields, and symbols must be unique.
    token_to_id = {}
    with open(args.tokens, "r", encoding="utf-8") as token_file:
        for raw_line in token_file:
            fields = raw_line.strip().split()
            assert len(fields) == 2, len(fields)
            symbol, index = fields
            assert symbol not in token_to_id, f"Duplicate token: {fields} "
            token_to_id[symbol] = int(index)

    return sherpa_onnx.encode_contexts(
        modeling_unit=args.modeling_unit,
        contexts=contexts,
        sp=bpe_processor,
        tokens_table=token_to_id,
    )
def main():
args = get_args()
assert_file_exists(args.tokens)
@@ -258,7 +215,8 @@ def main():
feature_dim=80,
decoding_method=args.decoding_method,
max_active_paths=args.max_active_paths,
context_score=args.context_score,
hotwords_file=args.hotwords_file,
hotwords_score=args.hotwords_score,
)
elif args.paraformer_encoder:
recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
@@ -277,12 +235,6 @@ def main():
print("Started!")
start_time = time.time()
contexts_list = []
contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
if contexts:
print(f"Contexts list: {contexts}")
contexts_list = encode_contexts(args, contexts)
streams = []
total_duration = 0
for wave_filename in args.sound_files:
@@ -291,10 +243,7 @@ def main():
duration = len(samples) / sample_rate
total_duration += duration
if contexts_list:
s = recognizer.create_stream(contexts_list=contexts_list)
else:
s = recognizer.create_stream()
s = recognizer.create_stream()
s.accept_waveform(sample_rate, samples)