Refactor hotwords, support loading hotwords from file (#296)
@@ -326,6 +326,31 @@ def add_modified_beam_search_args(parser: argparse.ArgumentParser):
     )


+def add_hotwords_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
+
 def check_args(args):
     if not Path(args.tokens).is_file():
         raise ValueError(f"{args.tokens} does not exist")
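For context, the hotwords file described above can be prepared ahead of time. A minimal sketch, assuming a SentencePiece bpe.model for BPE units and plain character splitting for CJK; the file names are hypothetical placeholders, not part of this commit:

import sentencepiece as spm

phrases = ["HELLO WORLD", "你好世界"]

sp = spm.SentencePieceProcessor()
sp.load("bpe.model")  # hypothetical path to the model's bpe.model

with open("hotwords.txt", "w", encoding="utf-8") as f:  # hypothetical output path
    for phrase in phrases:
        if phrase.isascii():
            # BPE pieces for English-like phrases, e.g. "▁HE LL O ▁WORLD"
            pieces = sp.encode(phrase, out_type=str)
        else:
            # one CJK character per token, e.g. "你 好 世 界"
            pieces = list(phrase)
        f.write(" ".join(pieces) + "\n")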
@@ -342,6 +367,10 @@ def check_args(args):
     assert Path(args.decoder).is_file(), args.decoder
     assert Path(args.joiner).is_file(), args.joiner

+    if args.hotwords_file != "":
+        assert args.decoding_method == "modified_beam_search", args.decoding_method
+        assert Path(args.hotwords_file).is_file(), args.hotwords_file
+

 def get_args():
     parser = argparse.ArgumentParser(
@@ -351,6 +380,7 @@ def get_args():
     add_model_args(parser)
     add_feature_config_args(parser)
     add_decoding_args(parser)
+    add_hotwords_args(parser)

     parser.add_argument(
         "--port",
@@ -792,6 +822,8 @@ def create_recognizer(args) -> sherpa_onnx.OfflineRecognizer:
            feature_dim=args.feat_dim,
            decoding_method=args.decoding_method,
            max_active_paths=args.max_active_paths,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
        )
    elif args.paraformer:
        assert len(args.nemo_ctc) == 0, args.nemo_ctc
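The two new keyword arguments flow into the recognizer construction shown above. A hedged usage sketch, not taken from the diff; the model paths are placeholders and the from_transducer factory is assumed from the sherpa_onnx Python API:

import sherpa_onnx

recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder="encoder.onnx",  # placeholder model paths
    decoder="decoder.onnx",
    joiner="joiner.onnx",
    tokens="tokens.txt",
    decoding_method="modified_beam_search",  # hotwords require this method
    hotwords_file="hotwords.txt",  # pre-tokenized hotwords, one phrase per line
    hotwords_score=1.5,
)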
@@ -82,7 +82,6 @@ from pathlib import Path
 from typing import List, Tuple

 import numpy as np
-import sentencepiece as spm
 import sherpa_onnx

@@ -98,43 +97,25 @@ def get_args():
     )

     parser.add_argument(
-        "--bpe-model",
+        "--hotwords-file",
         type=str,
         default="",
         help="""
-        Path to bpe.model,
-        Used only when --decoding-method=modified_beam_search
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
         """,
     )

-    parser.add_argument(
-        "--modeling-unit",
-        type=str,
-        default="char",
-        help="""
-        The type of modeling unit.
-        Valid values are bpe, bpe+char, char.
-        Note: the char here means characters in CJK languages.
-        """,
-    )
-
-    parser.add_argument(
-        "--contexts",
-        type=str,
-        default="",
-        help="""
-        The context list, it is a string containing some words/phrases separated
-        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
-        """,
-    )
-
     parser.add_argument(
-        "--context-score",
+        "--hotwords-score",
         type=float,
         default=1.5,
         help="""
-        The context score of each token for biasing word/phrase. Used only if
-        --contexts is given.
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
         """,
     )
@@ -273,25 +254,6 @@ def assert_file_exists(filename: str):
         "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
     )


-def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
-    sp = None
-    if "bpe" in args.modeling_unit:
-        assert_file_exists(args.bpe_model)
-        sp = spm.SentencePieceProcessor()
-        sp.load(args.bpe_model)
-    tokens = {}
-    with open(args.tokens, "r", encoding="utf-8") as f:
-        for line in f:
-            toks = line.strip().split()
-            assert len(toks) == 2, len(toks)
-            assert toks[0] not in tokens, f"Duplicate token: {toks} "
-            tokens[toks[0]] = int(toks[1])
-    return sherpa_onnx.encode_contexts(
-        modeling_unit=args.modeling_unit, contexts=contexts, sp=sp, tokens_table=tokens
-    )
-
-
 def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
     """
     Args:
@@ -322,7 +284,6 @@ def main():
     assert_file_exists(args.tokens)
     assert args.num_threads > 0, args.num_threads

-    contexts_list = []
     if args.encoder:
         assert len(args.paraformer) == 0, args.paraformer
         assert len(args.nemo_ctc) == 0, args.nemo_ctc
@@ -330,11 +291,6 @@ def main():
         assert len(args.whisper_decoder) == 0, args.whisper_decoder
         assert len(args.tdnn_model) == 0, args.tdnn_model

-        contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
-        if contexts:
-            print(f"Contexts list: {contexts}")
-            contexts_list = encode_contexts(args, contexts)
-
         assert_file_exists(args.encoder)
         assert_file_exists(args.decoder)
         assert_file_exists(args.joiner)
@@ -348,7 +304,8 @@ def main():
             sample_rate=args.sample_rate,
             feature_dim=args.feature_dim,
             decoding_method=args.decoding_method,
-            context_score=args.context_score,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
             debug=args.debug,
         )
     elif args.paraformer:
@@ -425,12 +382,7 @@ def main():
         samples, sample_rate = read_wave(wave_filename)
         duration = len(samples) / sample_rate
         total_duration += duration
-        if contexts_list:
-            assert len(args.paraformer) == 0, args.paraformer
-            assert len(args.nemo_ctc) == 0, args.nemo_ctc
-            s = recognizer.create_stream(contexts_list=contexts_list)
-        else:
-            s = recognizer.create_stream()
+        s = recognizer.create_stream()
         s.accept_waveform(sample_rate, samples)

         streams.append(s)
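The net effect of this hunk: biasing is no longer configured per stream through contexts_list but once on the recognizer, so every stream is created uniformly. A sketch of the old versus new call pattern, using only names that appear in the diff:

# before this commit: per-stream biasing
#   s = recognizer.create_stream(contexts_list=contexts_list)
# after this commit: the recognizer itself carries the hotwords configuration
s = recognizer.create_stream()
s.accept_waveform(sample_rate, samples)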
@@ -48,7 +48,6 @@ from pathlib import Path
 from typing import List, Tuple

 import numpy as np
-import sentencepiece as spm
 import sherpa_onnx

@@ -124,46 +123,25 @@ def get_args():
     )

     parser.add_argument(
-        "--bpe-model",
+        "--hotwords-file",
         type=str,
         default="",
         help="""
-        Path to bpe.model, it will be used to tokenize contexts biasing phrases.
-        Used only when --decoding-method=modified_beam_search
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
         """,
     )

-    parser.add_argument(
-        "--modeling-unit",
-        type=str,
-        default="char",
-        help="""
-        The type of modeling unit, it will be used to tokenize contexts biasing phrases.
-        Valid values are bpe, bpe+char, char.
-        Note: the char here means characters in CJK languages.
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
-
-    parser.add_argument(
-        "--contexts",
-        type=str,
-        default="",
-        help="""
-        The context list, it is a string containing some words/phrases separated
-        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
-
     parser.add_argument(
-        "--context-score",
+        "--hotwords-score",
         type=float,
         default=1.5,
         help="""
-        The context score of each token for biasing word/phrase. Used only if
-        --contexts is given.
-        Used only when --decoding-method=modified_beam_search
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
         """,
     )
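Users migrating from --contexts can convert their phrase list to the new file format. A hypothetical helper, not part of this commit, that mirrors the removed parsing (split("/") plus strip().upper()) and the removed BPE tokenization:

import sentencepiece as spm

def contexts_to_hotwords_file(contexts: str, bpe_model: str, out_file: str) -> None:
    """Convert an old-style --contexts string into a hotwords file."""
    sp = spm.SentencePieceProcessor()
    sp.load(bpe_model)  # the bpe.model that matches the acoustic model
    with open(out_file, "w", encoding="utf-8") as f:
        for phrase in contexts.split("/"):
            phrase = phrase.strip().upper()
            if phrase:
                # one tokenized phrase per line, pieces separated by spaces
                f.write(" ".join(sp.encode(phrase, out_type=str)) + "\n")

# e.g. contexts_to_hotwords_file("HELLO WORLD/I LOVE YOU", "bpe.model", "hotwords.txt")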
@@ -214,27 +192,6 @@ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
     return samples_float32, f.getframerate()


-def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
-    sp = None
-    if "bpe" in args.modeling_unit:
-        assert_file_exists(args.bpe_model)
-        sp = spm.SentencePieceProcessor()
-        sp.load(args.bpe_model)
-    tokens = {}
-    with open(args.tokens, "r", encoding="utf-8") as f:
-        for line in f:
-            toks = line.strip().split()
-            assert len(toks) == 2, len(toks)
-            assert toks[0] not in tokens, f"Duplicate token: {toks} "
-            tokens[toks[0]] = int(toks[1])
-    return sherpa_onnx.encode_contexts(
-        modeling_unit=args.modeling_unit,
-        contexts=contexts,
-        sp=sp,
-        tokens_table=tokens,
-    )
-
-
 def main():
     args = get_args()
     assert_file_exists(args.tokens)
@@ -258,7 +215,8 @@ def main():
            feature_dim=80,
            decoding_method=args.decoding_method,
            max_active_paths=args.max_active_paths,
-            context_score=args.context_score,
+            hotwords_file=args.hotwords_file,
+            hotwords_score=args.hotwords_score,
        )
    elif args.paraformer_encoder:
        recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
@@ -277,12 +235,6 @@ def main():
     print("Started!")
     start_time = time.time()

-    contexts_list = []
-    contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
-    if contexts:
-        print(f"Contexts list: {contexts}")
-        contexts_list = encode_contexts(args, contexts)
-
     streams = []
     total_duration = 0
     for wave_filename in args.sound_files:
@@ -291,10 +243,7 @@ def main():
         duration = len(samples) / sample_rate
         total_duration += duration

-        if contexts_list:
-            s = recognizer.create_stream(contexts_list=contexts_list)
-        else:
-            s = recognizer.create_stream()
+        s = recognizer.create_stream()

         s.accept_waveform(sample_rate, samples)

@@ -79,6 +79,30 @@ def get_args():
         help="Valid values: cpu, cuda, coreml",
     )

+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
     return parser.parse_args()

@@ -104,6 +128,8 @@ def create_recognizer(args):
         rule3_min_utterance_length=300, # it essentially disables this rule
         decoding_method=args.decoding_method,
         provider=args.provider,
+        hotwords_file=args.hotwords_file,
+        hotwords_score=args.hotwords_score,
     )
     return recognizer
@@ -11,7 +11,6 @@ import sys
 from pathlib import Path

 from typing import List
-import sentencepiece as spm

 try:
     import sounddevice as sd
@@ -90,49 +89,29 @@ def get_args():
     )

     parser.add_argument(
-        "--bpe-model",
+        "--hotwords-file",
         type=str,
         default="",
         help="""
-        Path to bpe.model, it will be used to tokenize contexts biasing phrases.
-        Used only when --decoding-method=modified_beam_search
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
         """,
     )

-    parser.add_argument(
-        "--modeling-unit",
-        type=str,
-        default="char",
-        help="""
-        The type of modeling unit, it will be used to tokenize contexts biasing phrases.
-        Valid values are bpe, bpe+char, char.
-        Note: the char here means characters in CJK languages.
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
-
-    parser.add_argument(
-        "--contexts",
-        type=str,
-        default="",
-        help="""
-        The context list, it is a string containing some words/phrases separated
-        with /, for example, 'HELLO WORLD/I LOVE YOU/GO AWAY".
-        Used only when --decoding-method=modified_beam_search
-        """,
-    )
-
     parser.add_argument(
-        "--context-score",
+        "--hotwords-score",
         type=float,
         default=1.5,
         help="""
-        The context score of each token for biasing word/phrase. Used only if
-        --contexts is given.
-        Used only when --decoding-method=modified_beam_search
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
         """,
     )


     return parser.parse_args()
@@ -155,32 +134,12 @@ def create_recognizer(args):
         decoding_method=args.decoding_method,
         max_active_paths=args.max_active_paths,
         provider=args.provider,
-        context_score=args.context_score,
+        hotwords_file=args.hotwords_file,
+        hotwords_score=args.hotwords_score,
     )
     return recognizer


-def encode_contexts(args, contexts: List[str]) -> List[List[int]]:
-    sp = None
-    if "bpe" in args.modeling_unit:
-        assert_file_exists(args.bpe_model)
-        sp = spm.SentencePieceProcessor()
-        sp.load(args.bpe_model)
-    tokens = {}
-    with open(args.tokens, "r", encoding="utf-8") as f:
-        for line in f:
-            toks = line.strip().split()
-            assert len(toks) == 2, len(toks)
-            assert toks[0] not in tokens, f"Duplicate token: {toks} "
-            tokens[toks[0]] = int(toks[1])
-    return sherpa_onnx.encode_contexts(
-        modeling_unit=args.modeling_unit,
-        contexts=contexts,
-        sp=sp,
-        tokens_table=tokens,
-    )
-
-
 def main():
     args = get_args()
@@ -193,12 +152,6 @@ def main():
     default_input_device_idx = sd.default.device[0]
     print(f'Use default device: {devices[default_input_device_idx]["name"]}')

-    contexts_list = []
-    contexts = [x.strip().upper() for x in args.contexts.split("/") if x.strip()]
-    if contexts:
-        print(f"Contexts list: {contexts}")
-        contexts_list = encode_contexts(args, contexts)
-
     recognizer = create_recognizer(args)
     print("Started! Please speak")
@@ -207,10 +160,7 @@ def main():
     sample_rate = 48000
     samples_per_read = int(0.1 * sample_rate)  # 0.1 second = 100 ms
     last_result = ""
-    if contexts_list:
-        stream = recognizer.create_stream(contexts_list=contexts_list)
-    else:
-        stream = recognizer.create_stream()
+    stream = recognizer.create_stream()
     with sd.InputStream(channels=1, dtype="float32", samplerate=sample_rate) as s:
         while True:
             samples, _ = s.read(samples_per_read)  # a blocking read
@@ -87,6 +87,30 @@ def get_args():
         """,
     )

+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
     return parser.parse_args()

@@ -107,6 +131,8 @@ def create_recognizer(args):
         rule1_min_trailing_silence=2.4,
         rule2_min_trailing_silence=1.2,
         rule3_min_utterance_length=300,  # it essentially disables this rule
+        hotwords_file=args.hotwords_file,
+        hotwords_score=args.hotwords_score,
     )
     return recognizer
@@ -187,6 +187,32 @@ def add_decoding_args(parser: argparse.ArgumentParser):
     add_modified_beam_search_args(parser)


+def add_hotwords_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--hotwords-file",
+        type=str,
+        default="",
+        help="""
+        The file containing hotwords, one word/phrase per line. For each
+        phrase, the bpe/cjkchar tokens are separated by spaces. For example:
+
+        ▁HE LL O ▁WORLD
+        你 好 世 界
+        """,
+    )
+
+    parser.add_argument(
+        "--hotwords-score",
+        type=float,
+        default=1.5,
+        help="""
+        The hotword score of each token for biasing word/phrase. Used only if
+        --hotwords-file is given.
+        """,
+    )
+
+
 def add_modified_beam_search_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--num-active-paths",
@@ -239,6 +265,7 @@ def get_args():
     add_model_args(parser)
     add_decoding_args(parser)
     add_endpointing_args(parser)
+    add_hotwords_args(parser)

     parser.add_argument(
         "--port",
@@ -343,6 +370,8 @@ def create_recognizer(args) -> sherpa_onnx.OnlineRecognizer:
        feature_dim=args.feat_dim,
        decoding_method=args.decoding_method,
        max_active_paths=args.num_active_paths,
+        hotwords_score=args.hotwords_score,
+        hotwords_file=args.hotwords_file,
        enable_endpoint_detection=args.use_endpoint != 0,
        rule1_min_trailing_silence=args.rule1_min_trailing_silence,
        rule2_min_trailing_silence=args.rule2_min_trailing_silence,
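For the streaming scripts the pattern mirrors the offline one. A hedged sketch of constructing an online recognizer with the new arguments; the paths are placeholders and the from_transducer factory is assumed from the sherpa_onnx Python API:

import sherpa_onnx

recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
    tokens="tokens.txt",  # placeholder model paths
    encoder="encoder.onnx",
    decoder="decoder.onnx",
    joiner="joiner.onnx",
    decoding_method="modified_beam_search",  # hotwords require this method
    hotwords_file="hotwords.txt",
    hotwords_score=1.5,
)
stream = recognizer.create_stream()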